m2a.rb in mathml2asciimath-0.0.11

- old
+ new

@@ -1,235 +1,235 @@
 require "nokogiri"
 require "htmlentities"
 require "pp"
 
 module MathML2AsciiMath
+  def self.m2a(xml)
+    normalized = xml
 
-  def self.m2a(x)
-    normalized = x
-
     # Parse the MathML; &:noblanks skips non-significant whitespace nodes
     docxml = Nokogiri::XML.parse(normalized, &:noblanks)
 
     # Turn every blank character (e.g. the thin space in
     # <mtext>&#x2009;</mtext>) into " ", normalize, and squeeze repeated spaces
-    parse(docxml.root).gsub(/[[:blank:]]/, ' ').unicode_normalize.squeeze(' ')
+    parse(docxml.root).gsub(/[[:blank:]]/, " ").unicode_normalize.squeeze(" ")
   end
 
-  def self.encodechars(x)
-    x.gsub(/\u03b1/, 'alpha').
-      gsub(/\u03b2/, 'beta').
-      gsub(/\u03b3/, 'gamma').
-      gsub(/\u0393/, 'Gamma').
-      gsub(/\u03b4/, 'delta').
-      gsub(/\u0394/, 'Delta').
-      gsub(/\u2206/, 'Delta').
-      gsub(/\u03b5/, 'epsilon').
-      gsub(/\u025b/, 'varepsilon').
-      gsub(/\u03b6/, 'zeta').
-      gsub(/\u03b7/, 'eta').
-      gsub(/\u03b8/, 'theta').
-      gsub(/\u0398/, 'Theta').
-      gsub(/\u03d1/, 'vartheta').
-      gsub(/\u03b9/, 'iota').
-      gsub(/\u03ba/, 'kappa').
-      gsub(/\u03bb/, 'lambda').
-      gsub(/\u039b/, 'Lambda').
-      gsub(/\u03bc/, 'mu').
-      gsub(/\u03bd/, 'nu').
-      gsub(/\u03be/, 'xi').
-      gsub(/\u039e/, 'Xi').
-      gsub(/\u03c0/, 'pi').
-      gsub(/\u03a0/, 'Pi').
-      gsub(/\u03c1/, 'rho').
-      gsub(/\u03c2/, 'beta').
-      gsub(/\u03c3/, 'sigma').
-      gsub(/\u03a3/, 'Sigma').
-      gsub(/\u03c4/, 'tau').
-      gsub(/\u03c5/, 'upsilon').
-      gsub(/\u03c6/, 'phi').
-      gsub(/\u03a6/, 'Phi').
-      gsub(/\u03d5/, 'varphi').
-      gsub(/\u03c7/, 'chi').
-      gsub(/\u03c8/, 'psi').
-      gsub(/\u03a8/, 'Psi').
-      gsub(/\u03c9/, 'omega').
-      gsub(/\u03a9/, 'omega').
-      gsub(/\u22c5/, '*').
-      gsub(/\u2219/, '*').
-      gsub(/\u00b7/, '*').
-      gsub(/\u2217/, '**').
-      gsub(/\u22c6/, '***').
-      gsub(/\//, '//').
-      gsub(/\\/, "\\\\").
-      gsub(/\u00d7/, 'xx').
-      gsub(/\u22c9/, '|><').
-      gsub(/\u22ca/, '><|').
-      gsub(/\u22c8/, '|><|').
-      gsub(/\u00f7/, '-:').
-      gsub(/\u2218/, '@').
-      gsub(/\u2295/, 'o+').
-      gsub(/\u2a01/, 'o+').
-      gsub(/\u2297/, 'ox').
-      gsub(/\u2299/, 'o.').
-      gsub(/\u2211/, 'sum').
-      gsub(/\u220f/, 'prod').
-      gsub(/\u2227/, '^^').
-      gsub(/\u22c0/, '^^^').
-      gsub(/\u2228/, 'vv').
-      gsub(/\u22c1/, 'vvv').
-      gsub(/\u2229/, 'nn').
-      gsub(/\u22c2/, 'nnn').
-      gsub(/\u222a/, 'uu').
-      gsub(/\u22c3/, 'uuu').
-      gsub(/\u2260/, '!=').
-      gsub(/\u2264/, '<=').
-      gsub(/\u2265/, '>=').
-      gsub(/\u227a/, '-<').
-      gsub(/\u227b/, '>-').
-      gsub(/\u2aaf/, '-<=').
-      gsub(/\u2ab0/, '>-=').
-      gsub(/\u2208/, 'in').
-      gsub(/\u2209/, '!in').
-      gsub(/\u2282/, 'sub').
-      gsub(/\u2283/, 'sup').
-      gsub(/\u2286/, 'sube').
-      gsub(/\u2287/, 'supe').
-      gsub(/\u2261/, '-=').
-      gsub(/\u2245/, '~=').
-      gsub(/\u2248/, '~~').
-      gsub(/\u221d/, 'prop').
-      gsub(/\u00ac/, 'not').
-      gsub(/\u21d2/, '=>').
-      gsub(/\u21d4/, '<=>').
-      gsub(/\u2200/, 'AA').
-      gsub(/\u2203/, 'EE').
-      gsub(/\u22a5/, '_|_').
-      gsub(/\u22a4/, 'TT').
-      gsub(/\u22a2/, '|--').
-      gsub(/\u22a8/, '|==').
-      gsub(/\u22a8/, '|==').
-      gsub(/\u2329/, '(:').
-      gsub(/\u232a/, ':)').
-      gsub(/\u2329/, '<<').
-      gsub(/\u27e8/, '<<').
-      gsub(/\u232a/, '>>').
-      gsub(/\u27e9/, '>>').
-      gsub(/\u222b/, 'int').
-      gsub(/\u222e/, 'oint').
-      gsub(/\u2202/, 'del').
-      gsub(/\u2207/, 'grad').
-      gsub(/\u00b1/, '+-').
-      gsub(/\u2205/, "O/").
-      gsub(/\u221e/, 'oo').
-      gsub(/\u2135/, 'aleph').
-      gsub(/\u2234/, ':.').
-      gsub(/\u2235/, ":'").
-      gsub(/\u2220/, "/_").
-      gsub(/\u25b3/, "/_\\").
-      gsub(/\u2032/, "'").
-      gsub(/~/, 'tilde').
-      gsub(/\u00a0\u00a0\u00a0\u00a0/, 'qquad').
-      gsub(/\u00a0\u00a0/, 'quad').
-      gsub(/\u00a0/, "\\ ").
-      gsub(/\u2322/, 'frown').
-      gsub(/\u00a0/, 'quad').
-      gsub(/\u22ef/, 'cdots').
-      gsub(/\u22ee/, 'vdots').
-      gsub(/\u22f1/, 'ddots').
-      gsub(/\u22c4/, 'diamond').
-      gsub(/\u25a1/, 'square').
-      gsub(/\u230a/, '|__').
-      gsub(/\u230b/, '__|').
-      gsub(/\u2308/, '|~').
-      gsub(/\u2309/, '~|').
-      gsub(/\u2102/, 'CC').
-      gsub(/\u2115/, 'NN').
-      gsub(/\u211a/, 'QQ').
-      gsub(/\u211d/, 'RR').
-      gsub(/\u2124/, 'ZZ').
-      gsub(/\u2191/, 'uarr').
-      gsub(/\u2193/, 'darr').
-      gsub(/\u2190/, 'larr').
-      gsub(/\u2194/, 'harr').
-      gsub(/\u21d2/, 'rArr').
-      gsub(/\u21d0/, 'lArr').
-      gsub(/\u21d4/, 'hArr').
-      gsub(/\u2192/, '->').
-      gsub(/\u21a3/, '>->').
-      gsub(/\u21a0/, '->>').
-      gsub(/\u2916/, '>->>').
-      gsub(/\u21a6/, '|->').
-      gsub(/\u2026/, '...').
-      gsub(/\u2212/, '-').
-      gsub(/\u2061/, ''). # function application
-      gsub(/\u2751/, 'square').
-      gsub(/[\u2028\u2029]/, ' ') # normalize thin spaces like \u2009, \u2008
+  def self.encodechars(xml) # map Unicode math characters to AsciiMath tokens
+    xml.gsub(/\u03b1/, "alpha")
+      .gsub(/\u03b2/, "beta")
+      .gsub(/\u03b3/, "gamma")
+      .gsub(/\u0393/, "Gamma")
+      .gsub(/\u03b4/, "delta")
+      .gsub(/\u0394/, "Delta")
+      .gsub(/\u2206/, "Delta")
+      .gsub(/\u03b5/, "epsilon")
+      .gsub(/\u025b/, "varepsilon")
+      .gsub(/\u03b6/, "zeta")
+      .gsub(/\u03b7/, "eta")
+      .gsub(/\u03b8/, "theta")
+      .gsub(/\u0398/, "Theta")
+      .gsub(/\u03d1/, "vartheta")
+      .gsub(/\u03b9/, "iota")
+      .gsub(/\u03ba/, "kappa")
+      .gsub(/\u03bb/, "lambda")
+      .gsub(/\u039b/, "Lambda")
+      .gsub(/\u03bc/, "mu")
+      .gsub(/\u03bd/, "nu")
+      .gsub(/\u03be/, "xi")
+      .gsub(/\u039e/, "Xi")
+      .gsub(/\u03c0/, "pi")
+      .gsub(/\u03a0/, "Pi")
+      .gsub(/\u03c1/, "rho")
+      .gsub(/\u03c2/, "sigma") # U+03C2 is final sigma, not beta
+      .gsub(/\u03c3/, "sigma")
+      .gsub(/\u03a3/, "Sigma")
+      .gsub(/\u03c4/, "tau")
+      .gsub(/\u03c5/, "upsilon")
+      .gsub(/\u03c6/, "phi")
+      .gsub(/\u03a6/, "Phi")
+      .gsub(/\u03d5/, "varphi")
+      .gsub(/\u03c7/, "chi")
+      .gsub(/\u03c8/, "psi")
+      .gsub(/\u03a8/, "Psi")
+      .gsub(/\u03c9/, "omega")
+      .gsub(/\u03a9/, "Omega") # U+03A9 is capital omega
+      .gsub(/\u22c5/, "*")
+      .gsub(/\u2219/, "*")
+      .gsub(/\u00b7/, "*")
+      .gsub(/\u2217/, "**")
+      .gsub(/\u22c6/, "***")
+      .gsub(/\//, "//")
+      .gsub(/\\/, "\\\\") # NOTE(review): replacement emits one "\", so a no-op
+      .gsub(/\u00d7/, "xx")
+      .gsub(/\u22c9/, "|><")
+      .gsub(/\u22ca/, "><|")
+      .gsub(/\u22c8/, "|><|")
+      .gsub(/\u00f7/, "-:")
+      .gsub(/\u2218/, "@")
+      .gsub(/\u2295/, "o+")
+      .gsub(/\u2a01/, "o+")
+      .gsub(/\u2297/, "ox")
+      .gsub(/\u2299/, "o.")
+      .gsub(/\u2211/, "sum")
+      .gsub(/\u220f/, "prod")
+      .gsub(/\u2227/, "^^")
+      .gsub(/\u22c0/, "^^^")
+      .gsub(/\u2228/, "vv")
+      .gsub(/\u22c1/, "vvv")
+      .gsub(/\u2229/, "nn")
+      .gsub(/\u22c2/, "nnn")
+      .gsub(/\u222a/, "uu")
+      .gsub(/\u22c3/, "uuu")
+      .gsub(/\u2260/, "!=")
+      .gsub(/\u2264/, "<=")
+      .gsub(/\u2265/, ">=")
+      .gsub(/\u227a/, "-<")
+      .gsub(/\u227b/, ">-")
+      .gsub(/\u2aaf/, "-<=")
+      .gsub(/\u2ab0/, ">-=")
+      .gsub(/\u2208/, "in")
+      .gsub(/\u2209/, "!in")
+      .gsub(/\u2282/, "sub")
+      .gsub(/\u2283/, "sup")
+      .gsub(/\u2286/, "sube")
+      .gsub(/\u2287/, "supe")
+      .gsub(/\u2261/, "-=")
+      .gsub(/\u2245/, "~=")
+      .gsub(/\u2248/, "~~")
+      .gsub(/\u221d/, "prop")
+      .gsub(/\u00ac/, "not")
+      .gsub(/\u21d2/, "=>")
+      .gsub(/\u21d4/, "<=>")
+      .gsub(/\u2200/, "AA")
+      .gsub(/\u2203/, "EE")
+      .gsub(/\u22a5/, "_|_")
+      .gsub(/\u22a4/, "TT")
+      .gsub(/\u22a2/, "|--")
+      .gsub(/\u22a8/, "|==")
+      .gsub(/\u22a8/, "|==") # no-op: duplicate of the previous step
+      .gsub(/\u2329/, "(:")
+      .gsub(/\u232a/, ":)")
+      .gsub(/\u2329/, "<<") # no-op: U+2329 already became "(:" above
+      .gsub(/\u27e8/, "<<")
+      .gsub(/\u232a/, ">>") # no-op: U+232A already became ":)" above
+      .gsub(/\u27e9/, ">>")
+      .gsub(/\u222b/, "int")
+      .gsub(/\u222e/, "oint")
+      .gsub(/\u2202/, "del")
+      .gsub(/\u2207/, "grad")
+      .gsub(/\u00b1/, "+-")
+      .gsub(/\u2205/, "O/")
+      .gsub(/\u221e/, "oo")
+      .gsub(/\u2135/, "aleph")
+      .gsub(/\u2234/, ":.")
+      .gsub(/\u2235/, ":'")
+      .gsub(/\u2220/, "/_")
+      .gsub(/\u25b3/, "/_\\")
+      .gsub(/\u2032/, "'")
+      .gsub(/~/, "tilde")
+      .gsub(/\u00a0\u00a0\u00a0\u00a0/, "qquad")
+      .gsub(/\u00a0\u00a0/, "quad")
+      .gsub(/\u00a0/, "\\ ")
+      .gsub(/\u2322/, "frown")
+      .gsub(/\u00a0/, "quad") # no-op: every U+00A0 was already replaced above
+      .gsub(/\u22ef/, "cdots")
+      .gsub(/\u22ee/, "vdots")
+      .gsub(/\u22f1/, "ddots")
+      .gsub(/\u22c4/, "diamond")
+      .gsub(/\u25a1/, "square")
+      .gsub(/\u230a/, "|__")
+      .gsub(/\u230b/, "__|")
+      .gsub(/\u2308/, "|~")
+      .gsub(/\u2309/, "~|")
+      .gsub(/\u2102/, "CC")
+      .gsub(/\u2115/, "NN")
+      .gsub(/\u211a/, "QQ")
+      .gsub(/\u211d/, "RR")
+      .gsub(/\u2124/, "ZZ")
+      .gsub(/\u2191/, "uarr")
+      .gsub(/\u2193/, "darr")
+      .gsub(/\u2190/, "larr")
+      .gsub(/\u2194/, "harr")
+      .gsub(/\u21d2/, "rArr") # no-op: U+21D2 already became "=>" above
+      .gsub(/\u21d0/, "lArr")
+      .gsub(/\u21d4/, "hArr") # no-op: U+21D4 already became "<=>" above
+      .gsub(/\u2192/, "->")
+      .gsub(/\u21a3/, ">->")
+      .gsub(/\u21a0/, "->>")
+      .gsub(/\u2916/, ">->>")
+      .gsub(/\u21a6/, "|->")
+      .gsub(/\u2026/, "...")
+      .gsub(/\u2212/, "-")
+      .gsub(/\u2061/, "") # function application
+      .gsub(/\u2751/, "square")
+      .gsub(/[\u2028\u2029]/, " ") # line/paragraph separators become a space
   end
 
-  def self.join_parsed_children(children, delimiter=' ')
+  def self.join_parsed_children(children, delimiter = " ")
     children.map do |n| # convert each child node via parse
       parse(n).strip # trim so only `delimiter` separates the pieces
     end.join(delimiter)
   end
 
   def self.parse(node)
-    out = ''
+    out = ""
     if node.text?
       return encodechars(HTMLEntities.new.decode(node.text))
     end
 
-    case node.name.sub(/^[^:]*:/, '')
+    case node.name.sub(/^[^:]*:/, "")
     when "math"
-      return join_parsed_children(node.elements)
+      join_parsed_children(node.elements)
 
     when "annotation"
-      return ''
+      ""
 
     when "semantics"
-      return join_parsed_children(node.elements)
+      join_parsed_children(node.elements)
 
     when "mrow"
       out = join_parsed_children(node.elements)
-      if %w[mfrac msub munder munderover].include? node.parent.name.sub(/^[^:]*:/, '')
+      if %w[mfrac msub munder munderover]
+        .include? node.parent.name.sub(/^[^:]*:/, "")
         out = "(#{out})"
       end
-      return out
+      out
 
     when "mfenced"
       sym_open = node["open"] || "("
       sym_close = node["close"] || ")"
 
       separator = "," # TODO currently ignore the supplied separators
       out = join_parsed_children(node.elements, separator)
-      return "#{sym_open}#{out}#{sym_close}"
+      "#{sym_open}#{out}#{sym_close}"
 
     when "msqrt"
-      return "sqrt(#{join_parsed_children(node.elements)})"
+      "sqrt(#{join_parsed_children(node.elements)})"
 
     when "mfrac"
-      return "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
+      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
 
     when "msup"
       sup = parse(node.elements[1])
       sup = "(#{sup})" unless sup.length == 1
-      op = parse(node.elements[0]).gsub(/ $/, '')
-      return "#{op}^#{sup}"
+      op = parse(node.elements[0]).gsub(/ $/, "")
+      "#{op}^#{sup}"
 
     when "msub"
       sub = parse(node.elements[1])
       sub = "(#{sub})" unless sub.length == 1
-      op = parse(node.elements[0]).gsub(/ $/, '')
-      return "#{op}_#{sub}"
+      op = parse(node.elements[0]).gsub(/ $/, "")
+      "#{op}_#{sub}"
 
     when "munderover", "msubsup"
       sub = parse(node.elements[1])
       sub = "(#{sub})" unless sub.length == 1
       sup = parse(node.elements[2])
       sup = "(#{sup})" unless sup.length == 1
-      op = parse(node.elements[0]).gsub(/ $/, '')
-      return "#{op}_#{sub}^#{sup}"
+      op = parse(node.elements[0]).gsub(/ $/, "")
+      "#{op}_#{sub}^#{sup}"
 
     when "munder"
       elem1 = parse(node.elements[1]).strip
       accent = case elem1
                when "\u0332" then "ul"
@@ -237,67 +237,67 @@
                else
                  "underset"
                end
 
       if accent == "underset"
-        return "underset(#{elem1})(#{parse(node.elements[0])})"
+        "underset(#{elem1})(#{parse(node.elements[0])})"
       else
-        return "#{accent} #{parse(node.elements[0])}"
+        "#{accent} #{parse(node.elements[0])}"
       end
 
     when "mover"
       elem1 = parse(node.elements[1]).strip
       accent = case elem1
                when "\u005e" then "hat"
                when "\u00af" then "bar"
-               #when "\u2192" then "vec"
+               # when "\u2192" then "vec"
                when "->" then "vec"
                when "." then "dot"
                when ".." then "ddot"
                when "\u23de" then "obrace"
                else
                  "overset"
                end
 
       if accent == "overset"
-        return "overset(#{elem1})(#{parse(node.elements[0])})"
+        "overset(#{elem1})(#{parse(node.elements[0])})"
       else
-        return "#{accent} #{parse(node.elements[0])}"
+        "#{accent} #{parse(node.elements[0])}"
       end
 
     when "mtable"
-      return "[#{join_parsed_children(node.elements, ',')}]"
+      "[#{join_parsed_children(node.elements, ',')}]"
 
     when "mtr"
-      return "[#{join_parsed_children(node.elements, ',')}]"
+      "[#{join_parsed_children(node.elements, ',')}]"
 
     when "mtd"
-      return join_parsed_children(node.elements, ',')
+      join_parsed_children(node.elements, ",")
 
     when "mn", "mtext"
-      return join_parsed_children(node.children, '')
+      join_parsed_children(node.children, "")
 
     when "mi"
       # FIXME: What does this comment have to do with Word?
-      # mi is not meant to have space around it, but Word is conflating operators and operands
-      out = join_parsed_children(node.children)
+      # mi is not meant to have space around it,
+      # but Word is conflating operators and operands
+      join_parsed_children(node.children)
 
       # FIXME: Why do we need to add extra spaces?
       # out = " #{out} " if /[^a-zA-Z0-9',]|[a-z][a-z]/.match out
-      return out
 
     when "mo"
       out = join_parsed_children(node.children)
-      out = " #{out} " unless node['fence']
-      return out
+      out = " #{out} " unless node["fence"]
+      out
 
     when "mstyle"
-      return join_parsed_children(node.children)
+      join_parsed_children(node.children)
 
     else
       "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">" +
         node.to_xml +
-      "</math>"
+        "</math>"
 
     end
   end
 end