lib/mathml2asciimath/m2a.rb in mathml2asciimath-0.0.10 vs lib/mathml2asciimath/m2a.rb in mathml2asciimath-0.0.11

- old
+ new

require "nokogiri"
require "htmlentities"
require "pp"

# Converts MathML markup into AsciiMath notation.
#
# Entry point is {MathML2AsciiMath.m2a}; the remaining module methods are the
# recursive building blocks it relies on.
module MathML2AsciiMath
  # Replacement table used by {encodechars}: each key is the literal text to
  # look for, each value is the AsciiMath text emitted in its place.
  #
  # The table is applied in a single pass over the input, so text produced by
  # one entry is never re-scanned by another (e.g. the "~=" emitted for
  # U+2245 is not rewritten by the "~" => "tilde" entry).
  ASCIIMATH_SYMBOLS = {
    # Greek letters
    "\u03b1" => "alpha",
    "\u03b2" => "beta",
    "\u03b3" => "gamma",
    "\u0393" => "Gamma",
    "\u03b4" => "delta",
    "\u0394" => "Delta",
    "\u2206" => "Delta",
    "\u03b5" => "epsilon",
    "\u025b" => "varepsilon",
    "\u03b6" => "zeta",
    "\u03b7" => "eta",
    "\u03b8" => "theta",
    "\u0398" => "Theta",
    "\u03d1" => "vartheta",
    "\u03b9" => "iota",
    "\u03ba" => "kappa",
    "\u03bb" => "lambda",
    "\u039b" => "Lambda",
    "\u03bc" => "mu",
    "\u03bd" => "nu",
    "\u03be" => "xi",
    "\u039e" => "Xi",
    "\u03c0" => "pi",
    "\u03a0" => "Pi",
    "\u03c1" => "rho",
    # U+03C2 is the final-form sigma; AsciiMath has no dedicated name for it,
    # so it is rendered as a plain sigma.
    "\u03c2" => "sigma",
    "\u03c3" => "sigma",
    "\u03a3" => "Sigma",
    "\u03c4" => "tau",
    "\u03c5" => "upsilon",
    "\u03c6" => "phi",
    "\u03a6" => "Phi",
    "\u03d5" => "varphi",
    "\u03c7" => "chi",
    "\u03c8" => "psi",
    "\u03a8" => "Psi",
    "\u03c9" => "omega",
    "\u03a9" => "Omega",

    # Operators
    "\u22c5" => "*",
    "\u2219" => "*",
    "\u00b7" => "*",
    "\u2217" => "**",
    "\u22c6" => "***",
    # A bare "/" means "fraction" in AsciiMath, so literal slashes and
    # backslashes are doubled.
    "/" => "//",
    "\\" => "\\\\",
    "\u00d7" => "xx",
    "\u22c9" => "|><",
    "\u22ca" => "><|",
    "\u22c8" => "|><|",
    "\u00f7" => "-:",
    "\u2218" => "@",
    "\u2295" => "o+",
    "\u2a01" => "o+",
    "\u2297" => "ox",
    "\u2299" => "o.",
    "\u2211" => "sum",
    "\u220f" => "prod",
    "\u2227" => "^^",
    "\u22c0" => "^^^",
    "\u2228" => "vv",
    "\u22c1" => "vvv",
    "\u2229" => "nn",
    "\u22c2" => "nnn",
    "\u222a" => "uu",
    "\u22c3" => "uuu",

    # Relations
    "\u2260" => "!=",
    "\u2264" => "<=",
    "\u2265" => ">=",
    "\u227a" => "-<",
    "\u227b" => ">-",
    "\u2aaf" => "-<=",
    "\u2ab0" => ">-=",
    "\u2208" => "in",
    "\u2209" => "!in",
    "\u2282" => "sub",
    "\u2283" => "sup",
    "\u2286" => "sube",
    "\u2287" => "supe",
    "\u2261" => "-=",
    "\u2245" => "~=",
    "\u2248" => "~~",
    "\u221d" => "prop",

    # Logic
    "\u00ac" => "not",
    "\u21d2" => "=>",
    "\u21d4" => "<=>",
    "\u2200" => "AA",
    "\u2203" => "EE",
    "\u22a5" => "_|_",
    "\u22a4" => "TT",
    "\u22a2" => "|--",
    "\u22a8" => "|==",

    # Brackets
    "\u2329" => "(:",
    "\u232a" => ":)",
    "\u27e8" => "<<",
    "\u27e9" => ">>",

    # Miscellaneous symbols
    "\u222b" => "int",
    "\u222e" => "oint",
    "\u2202" => "del",
    "\u2207" => "grad",
    "\u00b1" => "+-",
    "\u2205" => "O/",
    "\u221e" => "oo",
    "\u2135" => "aleph",
    "\u2234" => ":.",
    "\u2235" => ":'",
    "\u2220" => "/_",
    "\u25b3" => "/_\\",
    "\u2032" => "'",
    "~" => "tilde",
    # Runs of no-break spaces: longer runs are matched first.
    "\u00a0\u00a0\u00a0\u00a0" => "qquad",
    "\u00a0\u00a0" => "quad",
    "\u00a0" => "\\ ",
    "\u2322" => "frown",
    "\u22ef" => "cdots",
    "\u22ee" => "vdots",
    "\u22f1" => "ddots",
    "\u22c4" => "diamond",
    "\u25a1" => "square",
    "\u230a" => "|__",
    "\u230b" => "__|",
    "\u2308" => "|~",
    "\u2309" => "~|",
    "\u2102" => "CC",
    "\u2115" => "NN",
    "\u211a" => "QQ",
    "\u211d" => "RR",
    "\u2124" => "ZZ",

    # Arrows
    "\u2191" => "uarr",
    "\u2193" => "darr",
    "\u2190" => "larr",
    "\u2194" => "harr",
    "\u21d0" => "lArr",
    "\u2192" => "->",
    "\u21a3" => ">->",
    "\u21a0" => "->>",
    "\u2916" => ">->>",
    "\u21a6" => "|->",

    "\u2026" => "...",
    "\u2212" => "-",
    "\u2061" => "", # function application (invisible)
    "\u2751" => "square",
    # Unicode line / paragraph separators become plain spaces.
    "\u2028" => " ",
    "\u2029" => " ",
  }.freeze

  # Alternation of every table key, longest key first so that multi-character
  # keys (the no-break-space runs) win over their single-character prefix.
  ASCIIMATH_SYMBOL_PATTERN =
    Regexp.union(ASCIIMATH_SYMBOLS.keys.sort_by { |key| -key.length })

  private_constant :ASCIIMATH_SYMBOLS, :ASCIIMATH_SYMBOL_PATTERN

  # Converts a MathML document into an AsciiMath string.
  #
  # @param xml [String] MathML source
  # @return [String] AsciiMath text; "" when the parsed document has no root
  def self.m2a(xml)
    # &:noblanks skips non-significant whitespaces in MathML
    root = Nokogiri::XML.parse(xml, &:noblanks).root
    # No root element (e.g. empty input): nothing to convert.
    return "" if root.nil?

    # Get rid of things like
    # <mtext>&#x2009;</mtext>
    parse(root).gsub(/[[:blank:]]/, " ").unicode_normalize.squeeze(" ")
  end

  # Rewrites Unicode math characters (and a few ASCII characters that are
  # special in AsciiMath) as their AsciiMath spelling.
  #
  # @param xml [String] decoded text content of a MathML node
  # @return [String] a new string with every table entry substituted
  def self.encodechars(xml)
    # Block form: the replacement is used verbatim, with no backslash
    # interpretation of the table values.
    xml.gsub(ASCIIMATH_SYMBOL_PATTERN) { |match| ASCIIMATH_SYMBOLS[match] }
  end

  # Parses each node, strips the result and joins the pieces.
  #
  # @param children [Enumerable] nodes accepted by {parse}
  # @param delimiter [String] text placed between the parsed children
  # @return [String]
  def self.join_parsed_children(children, delimiter = " ")
    children.map { |child| parse(child).strip }.join(delimiter)
  end

  # Returns the node's element name without any namespace prefix
  # ("m:mrow" -> "mrow").
  def self.local_name(node)
    node.name.sub(/^[^:]*:/, "")
  end
  private_class_method :local_name

  # Recursively converts one node of the parsed MathML tree to AsciiMath.
  #
  # Text nodes are entity-decoded and passed through {encodechars}; element
  # nodes are dispatched on their namespace-free name. Elements that are not
  # recognised are returned as their own XML wrapped in a <math> element.
  #
  # @param node [Nokogiri::XML::Node]
  # @return [String]
  def self.parse(node)
    return encodechars(HTMLEntities.new.decode(node.text)) if node.text?

    case local_name(node)
    when "math", "semantics"
      join_parsed_children(node.elements)
    when "annotation"
      ""
    when "mrow"
      out = join_parsed_children(node.elements)
      # Parenthesise the group when the parent would otherwise lose track of
      # where this operand starts and ends.
      grouped = %w[mfrac msub munder munderover]
        .include?(local_name(node.parent))
      grouped ? "(#{out})" : out
    when "mfenced"
      sym_open = node["open"] || "("
      sym_close = node["close"] || ")"
      separator = "," # TODO currently ignore the supplied separators
      out = join_parsed_children(node.elements, separator)
      "#{sym_open}#{out}#{sym_close}"
    when "msqrt"
      "sqrt(#{join_parsed_children(node.elements)})"
    when "mfrac"
      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
    when "msup"
      sup = parse(node.elements[1])
      sup = "(#{sup})" unless sup.length == 1
      op = parse(node.elements[0]).gsub(/ $/, "")
      "#{op}^#{sup}"
    when "msub"
      sub = parse(node.elements[1])
      sub = "(#{sub})" unless sub.length == 1
      op = parse(node.elements[0]).gsub(/ $/, "")
      "#{op}_#{sub}"
    when "munderover", "msubsup"
      sub = parse(node.elements[1])
      sub = "(#{sub})" unless sub.length == 1
      sup = parse(node.elements[2])
      sup = "(#{sup})" unless sup.length == 1
      op = parse(node.elements[0]).gsub(/ $/, "")
      "#{op}_#{sub}^#{sup}"
    when "munder"
      elem1 = parse(node.elements[1]).strip
      # NOTE(review): the diff this code was recovered from omits one source
      # line (236) inside this `case`; a further `when` arm may belong here.
      # Restore it from the released gem before shipping.
      accent = case elem1
               when "\u0332" then "ul"
               else
                 "underset"
               end
      if accent == "underset"
        "underset(#{elem1})(#{parse(node.elements[0])})"
      else
        "#{accent} #{parse(node.elements[0])}"
      end
    when "mover"
      elem1 = parse(node.elements[1]).strip
      accent = case elem1
               when "\u005e" then "hat"
               when "\u00af" then "bar"
               # elem1 has already been through encodechars, so U+2192
               # arrives here as "->".
               # when "\u2192" then "vec"
               when "->" then "vec"
               when "." then "dot"
               when ".." then "ddot"
               when "\u23de" then "obrace"
               else
                 "overset"
               end
      if accent == "overset"
        "overset(#{elem1})(#{parse(node.elements[0])})"
      else
        "#{accent} #{parse(node.elements[0])}"
      end
    when "mtable", "mtr"
      "[#{join_parsed_children(node.elements, ',')}]"
    when "mtd"
      join_parsed_children(node.elements, ",")
    when "mn", "mtext"
      join_parsed_children(node.children, "")
    when "mi"
      # FIXME: What does this comment have to do with Word?
      # mi is not meant to have space around it,
      # but Word is conflating operators and operands
      join_parsed_children(node.children)
      # FIXME: Why do we need to add extra spaces?
      # out = " #{out} " if /[^a-zA-Z0-9',]|[a-z][a-z]/.match out
    when "mo"
      out = join_parsed_children(node.children)
      # Operators are padded with spaces unless the element carries a
      # `fence` attribute.
      out = " #{out} " unless node["fence"]
      out
    when "mstyle"
      join_parsed_children(node.children)
    else
      "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">" +
        node.to_xml +
        "</math>"
    end
  end
end