require "nokogiri"
require "htmlentities"
require "pp"

# Converts MathML markup into AsciiMath text.
#
# Entry point is {MathML2AsciiMath.m2a}; the remaining public module methods
# are the building blocks it uses and are kept public for existing callers.
module MathML2AsciiMath
  # Literal input text => AsciiMath replacement.
  #
  # The table is applied in ONE pass over the input (see {encodechars}), so
  # text produced by one entry is never re-scanned by another. This matters
  # for "~=" and "~~", which would otherwise be rewritten by the "~" rule.
  SYMBOLS = {
    # Greek letters
    "\u03b1" => "alpha",
    "\u03b2" => "beta",
    "\u03b3" => "gamma",
    "\u0393" => "Gamma",
    "\u03b4" => "delta",
    "\u0394" => "Delta",
    "\u2206" => "Delta",
    "\u03b5" => "epsilon",
    "\u025b" => "varepsilon",
    "\u03b6" => "zeta",
    "\u03b7" => "eta",
    "\u03b8" => "theta",
    "\u0398" => "Theta",
    "\u03d1" => "vartheta",
    "\u03b9" => "iota",
    "\u03ba" => "kappa",
    "\u03bb" => "lambda",
    "\u039b" => "Lambda",
    "\u03bc" => "mu",
    "\u03bd" => "nu",
    "\u03be" => "xi",
    "\u039e" => "Xi",
    "\u03c0" => "pi",
    "\u03a0" => "Pi",
    "\u03c1" => "rho",
    # NOTE(review): U+03C2 is GREEK SMALL LETTER FINAL SIGMA; mapping it to
    # "beta" looks like a mistake. Kept unchanged pending confirmation.
    "\u03c2" => "beta",
    "\u03c3" => "sigma",
    "\u03a3" => "Sigma",
    "\u03c4" => "tau",
    "\u03c5" => "upsilon",
    "\u03c6" => "phi",
    "\u03a6" => "Phi",
    "\u03d5" => "varphi",
    "\u03c7" => "chi",
    "\u03c8" => "psi",
    "\u03a8" => "Psi",
    "\u03c9" => "omega",
    "\u03a9" => "Omega",

    # Operators
    "\u22c5" => "*",
    "\u2219" => "*",
    "\u00b7" => "*",
    "\u2217" => "**",
    "\u22c6" => "***",
    "/" => "//",
    "\\" => "\\\\", # one backslash becomes two
    "\u00d7" => "xx",
    "\u22c9" => "|><",
    "\u22ca" => "><|",
    "\u22c8" => "|><|",
    "\u00f7" => "-:",
    "\u2218" => "@",
    "\u2295" => "o+",
    "\u2a01" => "o+",
    "\u2297" => "ox",
    "\u2299" => "o.",
    "\u2211" => "sum",
    "\u220f" => "prod",
    "\u2227" => "^^",
    "\u22c0" => "^^^",
    "\u2228" => "vv",
    "\u22c1" => "vvv",
    "\u2229" => "nn",
    "\u22c2" => "nnn",
    "\u222a" => "uu",
    "\u22c3" => "uuu",

    # Relations
    "\u2260" => "!=",
    "\u2264" => "<=",
    "\u2265" => ">=",
    "\u227a" => "-<",
    "\u227b" => ">-",
    "\u2aaf" => "-<=",
    "\u2ab0" => ">-=",
    "\u2208" => "in",
    "\u2209" => "!in",
    "\u2282" => "sub",
    "\u2283" => "sup",
    "\u2286" => "sube",
    "\u2287" => "supe",
    "\u2261" => "-=",
    "\u2245" => "~=",
    "\u2248" => "~~",
    "\u221d" => "prop",

    # Logic
    "\u00ac" => "not",
    "\u21d2" => "=>",
    "\u21d4" => "<=>",
    "\u2200" => "AA",
    "\u2203" => "EE",
    "\u22a5" => "_|_",
    "\u22a4" => "TT",
    "\u22a2" => "|--",
    "\u22a8" => "|==",

    # Angle brackets
    "\u2329" => "(:",
    "\u232a" => ":)",
    "\u27e8" => "<<",
    "\u27e9" => ">>",

    # Miscellaneous symbols
    "\u222b" => "int",
    "\u222e" => "oint",
    "\u2202" => "del",
    "\u2207" => "grad",
    "\u00b1" => "+-",
    "\u2205" => "O/",
    "\u221e" => "oo",
    "\u2135" => "aleph",
    "\u2234" => ":.",
    "\u2235" => ":'",
    "\u2220" => "/_",
    "\u25b3" => "/_\\",
    "\u2032" => "'",
    "~" => "tilde",
    # Runs of no-break spaces: the longest key wins at each position.
    "\u00a0\u00a0\u00a0\u00a0" => "qquad",
    "\u00a0\u00a0" => "quad",
    "\u00a0" => "\\ ",
    "\u2322" => "frown",
    "\u22ef" => "cdots",
    "\u22ee" => "vdots",
    "\u22f1" => "ddots",
    "\u22c4" => "diamond",
    "\u25a1" => "square",
    "\u230a" => "|__",
    "\u230b" => "__|",
    "\u2308" => "|~",
    "\u2309" => "~|",
    "\u2102" => "CC",
    "\u2115" => "NN",
    "\u211a" => "QQ",
    "\u211d" => "RR",
    "\u2124" => "ZZ",

    # Arrows
    "\u2191" => "uarr",
    "\u2193" => "darr",
    "\u2190" => "larr",
    "\u2194" => "harr",
    "\u21d0" => "lArr",
    "\u2192" => "->",
    "\u21a3" => ">->",
    "\u21a0" => "->>",
    "\u2916" => ">->>",
    "\u21a6" => "|->",

    "\u2026" => "...",
    "\u2212" => "-",
    "\u2061" => "", # function application
    "\u2751" => "square",
    # Line and paragraph separators become plain spaces.
    "\u2028" => " ",
    "\u2029" => " ",
  }.freeze
  private_constant :SYMBOLS

  # Alternation over every key of SYMBOLS, longest key first, so that
  # multi-character keys are tried before their own prefixes.
  SYMBOL_PATTERN = Regexp.union(SYMBOLS.keys.sort_by { |key| -key.length })
  private_constant :SYMBOL_PATTERN

  # Parsed <munder> accent text => AsciiMath prefix operator.
  UNDER_ACCENTS = {
    "\u0332" => "ul",
    "\u23df" => "ubrace",
  }.freeze
  private_constant :UNDER_ACCENTS

  # Parsed <mover> accent text => AsciiMath prefix operator.
  # The arrow is matched as "->" because the accent has already been through
  # encodechars by the time it is looked up.
  OVER_ACCENTS = {
    "^" => "hat",
    "\u00af" => "bar",
    "->" => "vec",
    "." => "dot",
    ".." => "ddot",
    "\u23de" => "obrace",
  }.freeze
  private_constant :OVER_ACCENTS

  # An <mrow> directly inside one of these elements is wrapped in parentheses.
  PARENTHESIZING_PARENTS = %w[mfrac msub munder munderover].freeze
  private_constant :PARENTHESIZING_PARENTS

  # Shared decoder; built once instead of once per text node.
  ENTITY_DECODER = HTMLEntities.new
  private_constant :ENTITY_DECODER

  # Converts a MathML document to AsciiMath.
  #
  # @param xml [String] MathML source
  # @return [String] AsciiMath text with every blank character turned into a
  #   plain space, Unicode-normalized, and runs of spaces collapsed to one
  def self.m2a(xml)
    # &:noblanks skips non-significant whitespaces in MathML
    docxml = Nokogiri::XML.parse(xml, &:noblanks)

    # Get rid of things like <mtext> </mtext>
    parse(docxml.root)
      .gsub(/[[:blank:]]/, " ")
      .unicode_normalize
      .squeeze(" ")
  end

  # Replaces Unicode math characters (and "/", "\", "~") by their AsciiMath
  # spelling.
  #
  # All replacements happen in a single left-to-right pass, so the output of
  # one rule is never rewritten by another.
  #
  # @param xml [String] decoded text content
  # @return [String] a new string with the replacements applied
  def self.encodechars(xml)
    # A Hash replacement is inserted literally: no backslash processing.
    xml.gsub(SYMBOL_PATTERN, SYMBOLS)
  end

  # Parses each node, strips the result, and joins the pieces.
  #
  # @param children [Enumerable] nodes to convert
  # @param delimiter [String] text placed between converted nodes
  # @return [String]
  def self.join_parsed_children(children, delimiter = " ")
    children.map { |child| parse(child).strip }.join(delimiter)
  end

  # Converts one node (and its subtree) to AsciiMath.
  #
  # Text nodes are entity-decoded and passed through {encodechars}. Elements
  # are dispatched on their name with any namespace prefix removed. Elements
  # with no dedicated rule are returned as their XML wrapped in a
  # <math> element.
  #
  # @param node [Nokogiri::XML::Node]
  # @return [String]
  def self.parse(node)
    return encodechars(ENTITY_DECODER.decode(node.text)) if node.text?

    case local_name(node)
    when "math", "semantics"
      join_parsed_children(node.elements)
    when "annotation"
      ""
    when "mrow"
      row = join_parsed_children(node.elements)
      PARENTHESIZING_PARENTS.include?(local_name(node.parent)) ? "(#{row})" : row
    when "mfenced"
      sym_open = node["open"] || "("
      sym_close = node["close"] || ")"
      separator = "," # TODO currently ignore the supplied separators
      "#{sym_open}#{join_parsed_children(node.elements, separator)}#{sym_close}"
    when "msqrt"
      "sqrt(#{join_parsed_children(node.elements)})"
    when "mfrac"
      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
    when "msup"
      "#{script_base(node)}^#{script_arg(node.elements[1])}"
    when "msub"
      "#{script_base(node)}_#{script_arg(node.elements[1])}"
    when "munderover", "msubsup"
      "#{script_base(node)}_#{script_arg(node.elements[1])}" \
        "^#{script_arg(node.elements[2])}"
    when "munder"
      accented(node, UNDER_ACCENTS, "underset")
    when "mover"
      accented(node, OVER_ACCENTS, "overset")
    when "mtable", "mtr"
      "[#{join_parsed_children(node.elements, ',')}]"
    when "mtd"
      join_parsed_children(node.elements, ",")
    when "mn", "mtext"
      join_parsed_children(node.children, "")
    when "mi"
      # FIXME: What does this comment have to do with Word?
      # mi is not meant to have space around it,
      # but Word is conflating operators and operands
      join_parsed_children(node.children)
      # FIXME: Why do we need to add extra spaces?
      # out = " #{out} " if /[^a-zA-Z0-9',]|[a-z][a-z]/.match out
    when "mo"
      operator = join_parsed_children(node.children)
      # Operators are padded with spaces unless a fence attribute is present.
      node["fence"] ? operator : " #{operator} "
    when "mstyle"
      join_parsed_children(node.children)
    else
      "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">#{node.to_xml}</math>"
    end
  end

  # Node name with any leading "prefix:" removed.
  def self.local_name(node)
    node.name.sub(/^[^:]*:/, "")
  end
  private_class_method :local_name

  # Converted first child of a script element, minus one trailing space.
  def self.script_base(node)
    parse(node.elements[0]).gsub(/ $/, "")
  end
  private_class_method :script_base

  # Converted sub/superscript, parenthesized unless exactly one character.
  def self.script_arg(element)
    text = parse(element)
    text.length == 1 ? text : "(#{text})"
  end
  private_class_method :script_arg

  # Renders <munder>/<mover>: a known accent becomes "<accent> <base>",
  # anything else becomes "<fallback>(<mark>)(<base>)".
  def self.accented(node, accents, fallback)
    mark = parse(node.elements[1]).strip
    base = parse(node.elements[0])
    accent = accents[mark]
    accent ? "#{accent} #{base}" : "#{fallback}(#{mark})(#{base})"
  end
  private_class_method :accented
end