require "nokogiri"
require "htmlentities"
require "pp"

# Converts MathML markup into AsciiMath notation.
#
# Entry point is MathML2AsciiMath.m2a; the remaining module methods are the
# recursive tree walker (parse), its child-joining helper, and the
# character-level symbol translation (encodechars).
module MathML2AsciiMath
  # Literal source text => AsciiMath replacement.
  #
  # The table is applied in a single pass (see encodechars), so a replacement
  # is never rescanned by another rule. Values are inserted verbatim: no
  # backslash escapes are interpreted in them.
  ASCIIMATH_SYMBOLS = {
    # Greek letters
    "\u03b1" => "alpha",
    "\u03b2" => "beta",
    "\u03b3" => "gamma",
    "\u0393" => "Gamma",
    "\u03b4" => "delta",
    "\u0394" => "Delta",
    "\u2206" => "Delta",
    "\u03b5" => "epsilon",
    "\u025b" => "varepsilon",
    "\u03b6" => "zeta",
    "\u03b7" => "eta",
    "\u03b8" => "theta",
    "\u0398" => "Theta",
    "\u03d1" => "vartheta",
    "\u03b9" => "iota",
    "\u03ba" => "kappa",
    "\u03bb" => "lambda",
    "\u039b" => "Lambda",
    "\u03bc" => "mu",
    "\u03bd" => "nu",
    "\u03be" => "xi",
    "\u039e" => "Xi",
    "\u03c0" => "pi",
    "\u03a0" => "Pi",
    "\u03c1" => "rho",
    # NOTE(review): U+03C2 is GREEK SMALL LETTER FINAL SIGMA; "beta" looks
    # like a typo but is kept as-is -- confirm the intended mapping.
    "\u03c2" => "beta",
    "\u03c3" => "sigma",
    "\u03a3" => "Sigma",
    "\u03c4" => "tau",
    "\u03c5" => "upsilon",
    "\u03c6" => "phi",
    "\u03a6" => "Phi",
    "\u03d5" => "varphi",
    "\u03c7" => "chi",
    "\u03c8" => "psi",
    "\u03a8" => "Psi",
    "\u03c9" => "omega",
    "\u03a9" => "Omega",

    # Operation symbols
    "\u22c5" => "*",
    "\u2219" => "*",
    "\u00b7" => "*",
    "\u2217" => "**",
    "\u22c6" => "***",
    "/" => "//",
    "\\" => "\\\\", # one backslash becomes two
    "\u00d7" => "xx",
    "\u22c9" => "|><",
    "\u22ca" => "><|",
    "\u22c8" => "|><|",
    "\u00f7" => "-:",
    "\u2218" => "@",
    "\u2295" => "o+",
    "\u2a01" => "o+",
    "\u2297" => "ox",
    "\u2299" => "o.",
    "\u2211" => "sum",
    "\u220f" => "prod",
    "\u2227" => "^^",
    "\u22c0" => "^^^",
    "\u2228" => "vv",
    "\u22c1" => "vvv",
    "\u2229" => "nn",
    "\u22c2" => "nnn",
    "\u222a" => "uu",
    "\u22c3" => "uuu",

    # Relation symbols
    "\u2260" => "!=",
    "\u2264" => "<=",
    "\u2265" => ">=",
    "\u227a" => "-<",
    "\u227b" => ">-",
    "\u2aaf" => "-<=",
    "\u2ab0" => ">-=",
    "\u2208" => "in",
    "\u2209" => "!in",
    "\u2282" => "sub",
    "\u2283" => "sup",
    "\u2286" => "sube",
    "\u2287" => "supe",
    "\u2261" => "-=",
    "\u2245" => "~=",
    "\u2248" => "~~",
    "\u221d" => "prop",

    # Logical symbols
    "\u00ac" => "not",
    "\u21d2" => "=>",
    "\u21d4" => "<=>",
    "\u2200" => "AA",
    "\u2203" => "EE",
    "\u22a5" => "_|_",
    "\u22a4" => "TT",
    "\u22a2" => "|--",
    "\u22a8" => "|==",

    # Angle brackets
    "\u2329" => "(:",
    "\u232a" => ":)",
    "\u27e8" => "<<",
    "\u27e9" => ">>",

    # Miscellaneous symbols
    "\u222b" => "int",
    "\u222e" => "oint",
    "\u2202" => "del",
    "\u2207" => "grad",
    "\u00b1" => "+-",
    "\u2205" => "O/",
    "\u221e" => "oo",
    "\u2135" => "aleph",
    "\u2234" => ":.",
    "\u2235" => ":'",
    "\u2220" => "/_",
    "\u25b3" => "/_\\",
    "\u2032" => "'",
    "~" => "tilde",
    # Runs of no-break spaces; the pattern tries the longest run first.
    "\u00a0\u00a0\u00a0\u00a0" => "qquad",
    "\u00a0\u00a0" => "quad",
    "\u00a0" => "\\ ",
    "\u2322" => "frown",
    "\u22ef" => "cdots",
    "\u22ee" => "vdots",
    "\u22f1" => "ddots",
    "\u22c4" => "diamond",
    "\u25a1" => "square",
    "\u230a" => "|__",
    "\u230b" => "__|",
    "\u2308" => "|~",
    "\u2309" => "~|",
    "\u2102" => "CC",
    "\u2115" => "NN",
    "\u211a" => "QQ",
    "\u211d" => "RR",
    "\u2124" => "ZZ",

    # Arrows
    "\u2191" => "uarr",
    "\u2193" => "darr",
    "\u2190" => "larr",
    "\u2194" => "harr",
    "\u21d0" => "lArr",
    "\u2192" => "->",
    "\u21a3" => ">->",
    "\u21a0" => "->>",
    "\u2916" => ">->>",
    "\u21a6" => "|->",

    # Punctuation and invisible characters
    "\u2026" => "...",
    "\u2212" => "-",
    "\u2061" => "", # function application
    "\u2751" => "square",
    "\u2028" => " ",
    "\u2029" => " "
  }.freeze

  # Alternation over every key of ASCIIMATH_SYMBOLS, longest key first so
  # that a run of four no-break spaces is matched before shorter runs.
  ASCIIMATH_SYMBOL_PATTERN = Regexp.union(
    ASCIIMATH_SYMBOLS.keys.sort_by { |key| -key.length }
  ).freeze

  # Converts a MathML document into an AsciiMath string.
  #
  # @param x [String] MathML source
  # @return [String] the AsciiMath rendering, with every [[:blank:]] character
  #   replaced by an ASCII space, Unicode-normalized, and runs of spaces
  #   squeezed to one
  def self.m2a(x)
    # &:noblanks skips non-significant whitespace in the MathML
    docxml = Nokogiri::XML.parse(x, &:noblanks)

    # The [[:blank:]] replacement is intended to normalise thin spaces
    # such as U+2009 and U+2008 (e.g. inside <mtext>).
    parse(docxml.root).gsub(/[[:blank:]]/, " ").unicode_normalize.squeeze(" ")
  end

  # Replaces symbols with their AsciiMath spelling per ASCIIMATH_SYMBOLS.
  #
  # All substitutions happen in one left-to-right pass, so the output of one
  # rule is never rewritten by another (e.g. "~=" produced for U+2245 is not
  # turned into "tilde=" by the "~" rule).
  #
  # @param x [String] decoded text content of a MathML node
  # @return [String] a new string with every table key replaced
  def self.encodechars(x)
    x.gsub(ASCIIMATH_SYMBOL_PATTERN, ASCIIMATH_SYMBOLS)
  end

  # Parses each node, strips surrounding whitespace from each result and
  # joins the results.
  #
  # @param children [Enumerable] nodes accepted by parse
  # @param delimiter [String] string placed between the parsed children
  # @return [String]
  def self.join_parsed_children(children, delimiter = " ")
    children.map { |child| parse(child).strip }.join(delimiter)
  end

  # Recursively renders one node as AsciiMath.
  #
  # Text nodes are entity-decoded and passed through encodechars. Elements
  # are dispatched on their name with any "prefix:" removed. Elements that
  # are not handled are returned as their XML wrapped in a <math> element.
  #
  # @param node [Nokogiri::XML::Node]
  # @return [String]
  def self.parse(node)
    return encodechars(HTMLEntities.new.decode(node.text)) if node.text?

    case local_name(node)
    when "math", "semantics"
      join_parsed_children(node.elements)
    when "annotation"
      ""
    when "mrow"
      row = join_parsed_children(node.elements)
      # Parenthesise rows that are direct arguments of these constructs.
      if %w[mfrac msub munder munderover].include?(local_name(node.parent))
        "(#{row})"
      else
        row
      end
    when "mfenced"
      sym_open = node["open"] || "("
      sym_close = node["close"] || ")"
      separator = "," # TODO: the supplied separators are currently ignored
      "#{sym_open}#{join_parsed_children(node.elements, separator)}#{sym_close}"
    when "msqrt"
      "sqrt(#{join_parsed_children(node.elements)})"
    when "mfrac"
      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
    when "msup"
      "#{script_base(node)}^#{wrap_script(parse(node.elements[1]))}"
    when "msub"
      "#{script_base(node)}_#{wrap_script(parse(node.elements[1]))}"
    when "munderover", "msubsup"
      lower = wrap_script(parse(node.elements[1]))
      upper = wrap_script(parse(node.elements[2]))
      "#{script_base(node)}_#{lower}^#{upper}"
    when "munder"
      under = parse(node.elements[1]).strip
      base = parse(node.elements[0])
      case under
      when "\u0332" then "ul #{base}"
      when "\u23df" then "ubrace #{base}"
      else "underset(#{under})(#{base})"
      end
    when "mover"
      over = parse(node.elements[1]).strip
      base = parse(node.elements[0])
      case over
      when "\u005e" then "hat #{base}"
      when "\u00af" then "bar #{base}"
      # U+2192 has already been rewritten to "->" by encodechars
      when "->" then "vec #{base}"
      when "." then "dot #{base}"
      when ".." then "ddot #{base}"
      when "\u23de" then "obrace #{base}"
      else "overset(#{over})(#{base})"
      end
    when "mtable", "mtr"
      "[#{join_parsed_children(node.elements, ',')}]"
    when "mtd"
      join_parsed_children(node.elements, ",")
    when "mn", "mtext"
      join_parsed_children(node.children, "")
    when "mi"
      # FIXME: What does this comment have to do with Word?
      # mi is not meant to have space around it, but Word is conflating
      # operators and operands
      # FIXME: Why do we need to add extra spaces?
      # out = " #{out} " if /[^a-zA-Z0-9',]|[a-z][a-z]/.match out
      join_parsed_children(node.children)
    when "mo"
      operator = join_parsed_children(node.children)
      # Any value of the fence attribute (even "false") suppresses padding.
      node["fence"] ? operator : " #{operator} "
    when "mstyle"
      join_parsed_children(node.children)
    else
      "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">#{node.to_xml}</math>"
    end
  end

  # Element name with any leading "prefix:" removed.
  def self.local_name(node)
    node.name.sub(/^[^:]*:/, "")
  end

  # Parsed first child of a script element, without a space before line end.
  def self.script_base(node)
    parse(node.elements[0]).gsub(/ $/, "")
  end

  # Wraps a sub/superscript in parentheses unless it is a single character.
  def self.wrap_script(script)
    script.length == 1 ? script : "(#{script})"
  end

  private_class_method :local_name, :script_base, :wrap_script
end