require "nokogiri" require "htmlentities" require "pp" module MathML2AsciiMath def self.m2a(x) docxml = Nokogiri::XML(x) parse(docxml.root).gsub(/ /, " "). sub(/^\s+/, ""). sub(/\s+$/, "") end def self.encodechars(x) x.gsub(/\u03b1/, "alpha"). gsub(/\u03b2/, "beta"). gsub(/\u03b3/, "gamma"). gsub(/\u0393/, "Gamma"). gsub(/\u03b4/, "delta"). gsub(/\u0394/, "Delta"). gsub(/\u2206/, "Delta"). gsub(/\u03b5/, "epsilon"). gsub(/\u025b/, "varepsilon"). gsub(/\u03b6/, "zeta"). gsub(/\u03b7/, "eta"). gsub(/\u03b8/, "theta"). gsub(/\u0398/, "Theta"). gsub(/\u03d1/, "vartheta"). gsub(/\u03b9/, "iota"). gsub(/\u03ba/, "kappa"). gsub(/\u03bb/, "lambda"). gsub(/\u039b/, "Lambda"). gsub(/\u03bc/, "mu"). gsub(/\u03bd/, "nu"). gsub(/\u03be/, "xi"). gsub(/\u039e/, "Xi"). gsub(/\u03c0/, "pi"). gsub(/\u03a0/, "Pi"). gsub(/\u03c1/, "rho"). gsub(/\u03c2/, "beta"). gsub(/\u03c3/, "sigma"). gsub(/\u03a3/, "Sigma"). gsub(/\u03c4/, "tau"). gsub(/\u03c5/, "upsilon"). gsub(/\u03c6/, "phi"). gsub(/\u03a6/, "Phi"). gsub(/\u03d5/, "varphi"). gsub(/\u03c7/, "chi"). gsub(/\u03c8/, "psi"). gsub(/\u03a8/, "Psi"). gsub(/\u03c9/, "omega"). gsub(/\u03a9/, "omega"). gsub(/\u22c5/, "*"). gsub(/\u2219/, "*"). gsub(/\u00b7/, "*"). gsub(/\u2217/, "**"). gsub(/\u22c6/, "***"). gsub(/\//, "//"). gsub(/\\/, "\\\\"). gsub(/\u00d7/, "xx"). gsub(/\u22c9/, "|><"). gsub(/\u22ca/, "><|"). gsub(/\u22c8/, "|><|"). gsub(/\u00f7/, "-:"). gsub(/\u2218/, "@"). gsub(/\u2295/, "o+"). gsub(/\u2a01/, "o+"). gsub(/\u2297/, "ox"). gsub(/\u2299/, "o."). gsub(/\u2211/, "sum"). gsub(/\u220f/, "prod"). gsub(/\u2227/, "^^"). gsub(/\u22c0/, "^^^"). gsub(/\u2228/, "vv"). gsub(/\u22c1/, "vvv"). gsub(/\u2229/, "nn"). gsub(/\u22c2/, "nnn"). gsub(/\u222a/, "uu"). gsub(/\u22c3/, "uuu"). gsub(/\u2260/, "!="). gsub(/\u2264/, "<="). gsub(/\u2265/, ">="). gsub(/\u227a/, "-<"). gsub(/\u227b/, ">-"). gsub(/\u2aaf/, "-<="). gsub(/\u2ab0/, ">-="). gsub(/\u2208/, "in"). gsub(/\u2209/, "!in"). gsub(/\u2282/, "sub"). gsub(/\u2283/, "sup"). gsub(/\u2286/, "sube"). gsub(/\u2287/, "supe"). gsub(/\u2261/, "-="). gsub(/\u2245/, "~="). gsub(/\u2248/, "~~"). gsub(/\u221d/, "prop"). gsub(/\u00ac/, "not"). gsub(/\u21d2/, "=>"). gsub(/\u21d4/, "<=>"). gsub(/\u2200/, "AA"). gsub(/\u2203/, "EE"). gsub(/\u22a5/, "_|_"). gsub(/\u22a4/, "TT"). gsub(/\u22a2/, "|--"). gsub(/\u22a8/, "|=="). gsub(/\u22a8/, "|=="). gsub(/\u2329/, "(:"). gsub(/\u232a/, ":)"). gsub(/\u2329/, "<<"). gsub(/\u27e8/, "<<"). gsub(/\u232a/, ">>"). gsub(/\u27e9/, ">>"). gsub(/\u222e/, "oint"). gsub(/\u2202/, "del"). gsub(/\u2207/, "grad"). gsub(/\u00b1/, "+-"). gsub(/\u2205/, "O/"). gsub(/\u221e/, "oo"). gsub(/\u2135/, "aleph"). gsub(/\u2234/, ":."). gsub(/\u2235/, ":'"). gsub(/\u2220/, "/_"). gsub(/\u25b3/, "/_\\"). gsub(/\u2032/, "'"). gsub(/~/, "tilde"). gsub(/\u00a0\u00a0\u00a0\u00a0/, "qquad"). gsub(/\u00a0\u00a0/, "quad"). gsub(/\u00a0/, "\\ "). gsub(/\u2322/, "frown"). gsub(/\u00a0/, "quad"). gsub(/\u22ef/, "cdots"). gsub(/\u22ee/, "vdots"). gsub(/\u22f1/, "ddots"). gsub(/\u22c4/, "diamond"). gsub(/\u25a1/, "square"). gsub(/\u230a/, "|__"). gsub(/\u230b/, "__|"). gsub(/\u2308/, "|~"). gsub(/\u2309/, "~|"). gsub(/\u2102/, "CC"). gsub(/\u2115/, "NN"). gsub(/\u211a/, "QQ"). gsub(/\u211d/, "RR"). gsub(/\u2124/, "ZZ"). gsub(/\u2191/, "uarr"). gsub(/\u2193/, "darr"). gsub(/\u2190/, "larr"). gsub(/\u2194/, "harr"). gsub(/\u21d2/, "rArr"). gsub(/\u21d0/, "lArr"). gsub(/\u21d4/, "hArr"). gsub(/\u2192/, "->"). gsub(/\u21a3/, ">->"). gsub(/\u21a0/, "->>"). gsub(/\u2916/, ">->>"). gsub(/\u21a6/, "|->"). gsub(/\u2026/, "..."). gsub(/\u2212/, "-"). gsub(/\u2061/, ""). # function application gsub(/\u2751/, "square") end def self.parse(node) out = "" if node.text? return encodechars(HTMLEntities.new.decode(node.text)) end case node.name.sub(/^[^:]*:/, "") when "math" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return outarr.join(" ") when "annotation" return "" when "semantics" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return outarr.join(" ") when "mrow" outarr = [] node.children.each { |n| outarr << parse(n).strip } out = outarr.join(" ") if %w{mfrac msub munder munderover}.include? node.parent.name.sub(/^[^:]*:/, "") out = "(#{out})" end return out when "mfenced" outarr = [] open = node["open"] || "(" close = node["close"] || ")" separator = "," # TODO currently ignore the supplied separators node.elements.each { |n| outarr << parse(n).strip } out = outarr.join(separator) return "#{open}#{out}#{close}" when "msqrt" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return "sqrt(#{outarr.join(" ")})" when "mfrac" return "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})" when "msup" sup = parse(node.elements[1]) sup = "(#{sup})" unless sup.length == 1 op = parse(node.elements[0]).gsub(/ $/, "") return "#{op}^#{sup}" when "msub" sub = parse(node.elements[1]) sub = "(#{sub})" unless sub.length == 1 op = parse(node.elements[0]).gsub(/ $/, "") return "#{op}_#{sub}" when "munderover", "msubsup" sub = parse(node.elements[1]) sub = "(#{sub})" unless sub.length == 1 sup = parse(node.elements[2]) sup = "(#{sup})" unless sup.length == 1 op = parse(node.elements[0]).gsub(/ $/, "") return "#{op}_#{sub}^#{sup}" when "munder" elem1 = parse(node.elements[1]).sub(/^\s+/, "").sub(/\s+$/, "") accent = case elem1 when "\u0332" then "ul" when "\u23df" then "ubrace" else "underset" end if accent == "underset" return "underset(#{elem1})(#{parse(node.elements[0])})" else return "#{accent} #{parse(node.elements[0])}" end when "mover" elem1 = parse(node.elements[1]).sub(/^\s+/, "").sub(/\s+$/, "") accent = case elem1 when "\u005e" then "hat" when "\u00af" then "bar" #when "\u2192" then "vec" when "->" then "vec" when "." then "dot" when ".." then "ddot" when "\u23de" then "obrace" else "overset" end if accent == "overset" return "overset(#{elem1})(#{parse(node.elements[0])})" else return "#{accent} #{parse(node.elements[0])}" end when "mtable" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return "[#{outarr.join(",")}]" when "mtr" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return "[#{outarr.join(",")}]" when "mtd" outarr = [] node.elements.each { |n| outarr << parse(n).strip } return "#{outarr.join(",")}" when "mn", "mtext" outarr = [] node.children.each { |n| outarr << parse(n).strip } return "#{outarr.join("")}" when "mi" # mi is not meant to have space around it, but Word is conflating operators and operands outarr = [] node.children.each { |n| outarr << parse(n).strip } out = outarr.join(" ") out = " #{out} " if /[^a-zA-Z0-9',]|[a-z][a-z]/.match out return out when "mo" outarr = [] node.children.each { |n| outarr << parse(n).strip } out = outarr.join(" ") out = " #{out} " unless node["fence"] return out when "mstyle" outarr = [] node.children.each { |n| outarr << parse(n).strip } out = outarr.join(" ") else node.to_xml end end end