lib/mathml2asciimath/m2a.rb in mathml2asciimath-0.0.9 vs lib/mathml2asciimath/m2a.rb in mathml2asciimath-0.0.10

- old
+ new

require "pp"

# Converts MathML markup into AsciiMath notation.
#
# The entry point is {MathML2AsciiMath.m2a}; the remaining module methods are
# the building blocks it uses while walking the parsed XML tree.
module MathML2AsciiMath
  # Text fragment => AsciiMath replacement, consulted in a single pass by
  # {encodechars}. Insertion order is significant: the compiled pattern tries
  # the keys in this order, so the longer non-breaking-space runs must come
  # before the shorter ones.
  #
  # This table replaces a long chain of sequential +gsub+ calls. In that chain
  # a second substitution for U+2329, U+232A, U+21D2, U+21D4 and U+00A0 could
  # never fire because an earlier one had already consumed the character, so
  # only the first mapping for each of them is kept here. A backslash
  # substitution whose replacement was again a single backslash is dropped as
  # well: backslashes simply pass through unchanged.
  ASCIIMATH_REPLACEMENTS = {
    # Greek letters
    "\u03b1" => 'alpha',
    "\u03b2" => 'beta',
    "\u03b3" => 'gamma',
    "\u0393" => 'Gamma',
    "\u03b4" => 'delta',
    "\u0394" => 'Delta',
    "\u2206" => 'Delta',
    "\u03b5" => 'epsilon',
    "\u025b" => 'varepsilon',
    "\u03b6" => 'zeta',
    "\u03b7" => 'eta',
    "\u03b8" => 'theta',
    "\u0398" => 'Theta',
    "\u03d1" => 'vartheta',
    "\u03b9" => 'iota',
    "\u03ba" => 'kappa',
    "\u03bb" => 'lambda',
    "\u039b" => 'Lambda',
    "\u03bc" => 'mu',
    "\u03bd" => 'nu',
    "\u03be" => 'xi',
    "\u039e" => 'Xi',
    "\u03c0" => 'pi',
    "\u03a0" => 'Pi',
    "\u03c1" => 'rho',
    # NOTE(review): U+03C2 is the final-sigma code point; mapping it to
    # 'beta' looks unintended, but is kept as found -- confirm.
    "\u03c2" => 'beta',
    "\u03c3" => 'sigma',
    "\u03a3" => 'Sigma',
    "\u03c4" => 'tau',
    "\u03c5" => 'upsilon',
    "\u03c6" => 'phi',
    "\u03a6" => 'Phi',
    "\u03d5" => 'varphi',
    "\u03c7" => 'chi',
    "\u03c8" => 'psi',
    "\u03a8" => 'Psi',
    "\u03c9" => 'omega',
    # Capital omega: was 'omega', which collided with the lowercase letter.
    "\u03a9" => 'Omega',

    # Operators
    "\u22c5" => '*',
    "\u2219" => '*',
    "\u00b7" => '*',
    "\u2217" => '**',
    "\u22c6" => '***',
    '/' => '//',
    "\u00d7" => 'xx',
    "\u22c9" => '|><',
    "\u22ca" => '><|',
    "\u22c8" => '|><|',
    "\u00f7" => '-:',
    "\u2218" => '@',
    "\u2295" => 'o+',
    "\u2a01" => 'o+',
    "\u2297" => 'ox',
    "\u2299" => 'o.',
    "\u2211" => 'sum',
    "\u220f" => 'prod',
    "\u2227" => '^^',
    "\u22c0" => '^^^',
    "\u2228" => 'vv',
    "\u22c1" => 'vvv',
    "\u2229" => 'nn',
    "\u22c2" => 'nnn',
    "\u222a" => 'uu',
    "\u22c3" => 'uuu',

    # Relations
    "\u2260" => '!=',
    "\u2264" => '<=',
    "\u2265" => '>=',
    "\u227a" => '-<',
    "\u227b" => '>-',
    "\u2aaf" => '-<=',
    "\u2ab0" => '>-=',
    "\u2208" => 'in',
    "\u2209" => '!in',
    "\u2282" => 'sub',
    "\u2283" => 'sup',
    "\u2286" => 'sube',
    "\u2287" => 'supe',
    "\u2261" => '-=',
    "\u2245" => '~=',
    "\u2248" => '~~',
    "\u221d" => 'prop',

    # Logic
    "\u00ac" => 'not',
    "\u21d2" => '=>',
    "\u21d4" => '<=>',
    "\u2200" => 'AA',
    "\u2203" => 'EE',
    "\u22a5" => '_|_',
    "\u22a4" => 'TT',
    "\u22a2" => '|--',
    "\u22a8" => '|==',

    # Brackets
    "\u2329" => '(:',
    "\u232a" => ':)',
    "\u27e8" => '<<',
    "\u27e9" => '>>',

    # Miscellaneous symbols
    "\u222b" => 'int',
    "\u222e" => 'oint',
    "\u2202" => 'del',
    "\u2207" => 'grad',
    "\u00b1" => '+-',
    "\u2205" => 'O/',
    "\u221e" => 'oo',
    "\u2135" => 'aleph',
    "\u2234" => ':.',
    "\u2235" => ":'",
    "\u2220" => '/_',
    "\u25b3" => "/_\\",
    "\u2032" => "'",
    '~' => 'tilde',

    # Non-breaking spaces: longest run first.
    "\u00a0\u00a0\u00a0\u00a0" => 'qquad',
    "\u00a0\u00a0" => 'quad',
    "\u00a0" => "\\ ",

    "\u2322" => 'frown',
    "\u22ef" => 'cdots',
    "\u22ee" => 'vdots',
    "\u22f1" => 'ddots',
    "\u22c4" => 'diamond',
    "\u25a1" => 'square',
    "\u230a" => '|__',
    "\u230b" => '__|',
    "\u2308" => '|~',
    "\u2309" => '~|',

    # Number sets
    "\u2102" => 'CC',
    "\u2115" => 'NN',
    "\u211a" => 'QQ',
    "\u211d" => 'RR',
    "\u2124" => 'ZZ',

    # Arrows
    "\u2191" => 'uarr',
    "\u2193" => 'darr',
    "\u2190" => 'larr',
    "\u2194" => 'harr',
    "\u21d0" => 'lArr',
    "\u2192" => '->',
    "\u21a3" => '>->',
    "\u21a0" => '->>',
    "\u2916" => '>->>',
    "\u21a6" => '|->',

    "\u2026" => '...',
    "\u2212" => '-',
    "\u2061" => '', # function application
    "\u2751" => 'square',

    # U+2028 / U+2029 (line and paragraph separator) become plain spaces.
    "\u2028" => ' ',
    "\u2029" => ' '
  }.freeze

  # Alternation over every key of ASCIIMATH_REPLACEMENTS, in table order.
  ASCIIMATH_PATTERN = Regexp.union(ASCIIMATH_REPLACEMENTS.keys).freeze

  # Accent marks recognised below the base of an +munder+ element.
  UNDER_ACCENTS = {
    "\u0332" => 'ul',
    "\u23df" => 'ubrace'
  }.freeze

  # Accent marks recognised above the base of an +mover+ element. The keys
  # are compared against already-encoded text, which is why the right arrow
  # appears as '->' rather than U+2192.
  #
  # NOTE(review): the material this was reconstructed from elides one line
  # between the '->' and '..' entries; confirm against the complete file
  # whether another accent belongs here.
  OVER_ACCENTS = {
    "\u005e" => 'hat',
    "\u00af" => 'bar',
    '->' => 'vec',
    '..' => 'ddot',
    "\u23de" => 'obrace'
  }.freeze

  private_constant :ASCIIMATH_REPLACEMENTS, :ASCIIMATH_PATTERN,
                   :UNDER_ACCENTS, :OVER_ACCENTS

  # Converts a MathML document into an AsciiMath string.
  #
  # @param x [String] MathML source
  # @return [String] AsciiMath text in which every blank character has been
  #   turned into a plain space and runs of spaces are squeezed to one
  def self.m2a(x)
    # &:noblanks skips non-significant whitespace in MathML
    docxml = Nokogiri::XML.parse(x, &:noblanks)

    # Get rid of things like
    # <mtext>&#x2009;</mtext>
    parse(docxml.root).gsub(/[[:blank:]]/, ' ').unicode_normalize.squeeze(' ')
  end

  # Replaces Unicode math symbols (and a few ASCII characters) with their
  # AsciiMath spelling.
  #
  # The text is scanned once, so a replacement is never re-examined. In
  # particular the '~=' and '~~' produced for U+2245 and U+2248 are no longer
  # turned into 'tilde=' / 'tildetilde' by the rule for a literal '~'.
  #
  # @param x [String] decoded text content of a MathML node
  # @return [String] a new string with every table entry substituted
  def self.encodechars(x)
    # With a Hash as second argument the values are inserted verbatim; no
    # backslash sequences are interpreted in them.
    x.gsub(ASCIIMATH_PATTERN, ASCIIMATH_REPLACEMENTS)
  end

  # Parses every node in +children+, strips each result and joins them.
  #
  # @param children [Enumerable] nodes accepted by {parse}
  # @param delimiter [String] text placed between the parsed children
  # @return [String]
  def self.join_parsed_children(children, delimiter = ' ')
    children.map { |n| parse(n).strip }.join(delimiter)
  end

  # Recursively converts one node of the parsed document to AsciiMath.
  #
  # Any namespace prefix on the element name is ignored. Elements without a
  # dedicated branch are returned as their own XML wrapped in a +math+
  # element.
  #
  # @param node [#text?, #name, #elements, #children] node of the document
  #   produced in {m2a}
  # @return [String]
  def self.parse(node)
    return encodechars(HTMLEntities.new.decode(node.text)) if node.text?

    case local_name(node)
    when 'math', 'semantics'
      join_parsed_children(node.elements)
    when 'annotation'
      ''
    when 'mrow'
      out = join_parsed_children(node.elements)
      # Group the row when it is an operand of one of these constructs.
      if %w[mfrac msub munder munderover].include?(local_name(node.parent))
        "(#{out})"
      else
        out
      end
    when 'mfenced'
      sym_open = node['open'] || '('
      sym_close = node['close'] || ')'
      separator = ',' # TODO: currently ignore the supplied separators
      "#{sym_open}#{join_parsed_children(node.elements, separator)}#{sym_close}"
    when 'msqrt'
      "sqrt(#{join_parsed_children(node.elements)})"
    when 'mfrac'
      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
    when 'msup'
      "#{script_base(node)}^#{wrap_script(parse(node.elements[1]))}"
    when 'msub'
      "#{script_base(node)}_#{wrap_script(parse(node.elements[1]))}"
    when 'munderover', 'msubsup'
      sub = wrap_script(parse(node.elements[1]))
      sup = wrap_script(parse(node.elements[2]))
      "#{script_base(node)}_#{sub}^#{sup}"
    when 'munder'
      accented(node, UNDER_ACCENTS, 'underset')
    when 'mover'
      accented(node, OVER_ACCENTS, 'overset')
    when 'mtable', 'mtr'
      "[#{join_parsed_children(node.elements, ',')}]"
    when 'mtd'
      join_parsed_children(node.elements, ',')
    when 'mn', 'mtext'
      join_parsed_children(node.children, '')
    when 'mi'
      # FIXME: an earlier revision padded the identifier with spaces when it
      # matched /[^a-zA-Z0-9',]|[a-z][a-z]/ (commented "Word is conflating
      # operators and operands"); that padding is disabled.
      join_parsed_children(node.children)
    when 'mo'
      out = join_parsed_children(node.children)
      # Operators are padded with spaces unless flagged as a fence.
      node['fence'] ? out : " #{out} "
    when 'mstyle'
      join_parsed_children(node.children)
    else
      "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">#{node.to_xml}</math>"
    end
  end

  # Element name with any namespace prefix (everything up to the first ':')
  # removed.
  def self.local_name(node)
    node.name.sub(/^[^:]*:/, '')
  end

  # Wraps a sub-/superscript in parentheses unless it is one character long.
  def self.wrap_script(text)
    text.length == 1 ? text : "(#{text})"
  end

  # Parsed first child of a script element, without a space at a line end.
  def self.script_base(node)
    parse(node.elements[0]).gsub(/ $/, '')
  end

  # Renders an +munder+ / +mover+ element: a known mark from +accents+ becomes
  # a prefix operator, anything else falls back to "<fallback>(mark)(base)".
  def self.accented(node, accents, fallback)
    mark = parse(node.elements[1]).strip
    base = parse(node.elements[0])
    accent = accents[mark]
    accent ? "#{accent} #{base}" : "#{fallback}(#{mark})(#{base})"
  end

  private_class_method :local_name, :wrap_script, :script_base, :accented
end