require "nokogiri"
require "htmlentities"
require "pp"

# Converts MathML markup into AsciiMath notation.
#
# Entry point is MathML2AsciiMath.m2a, which parses a MathML string with
# Nokogiri and walks the resulting tree with MathML2AsciiMath.parse.
# MathML2AsciiMath.encodechars translates individual Unicode symbols into
# their AsciiMath spellings.
module MathML2AsciiMath
  # Unicode symbol (or symbol sequence) => AsciiMath spelling.
  # Every key is replaced in a single pass over the input, so the output of
  # one substitution is never re-scanned by another (e.g. the "~" in "~="
  # is not turned into "tilde").
  SYMBOLS = {
    # Greek letters
    "\u03b1" => "alpha",
    "\u03b2" => "beta",
    "\u03b3" => "gamma",
    "\u0393" => "Gamma",
    "\u03b4" => "delta",
    "\u0394" => "Delta",
    "\u2206" => "Delta",
    "\u03b5" => "epsilon",
    "\u025b" => "varepsilon",
    "\u03b6" => "zeta",
    "\u03b7" => "eta",
    "\u03b8" => "theta",
    "\u0398" => "Theta",
    "\u03d1" => "vartheta",
    "\u03b9" => "iota",
    "\u03ba" => "kappa",
    "\u03bb" => "lambda",
    "\u039b" => "Lambda",
    "\u03bc" => "mu",
    "\u03bd" => "nu",
    "\u03be" => "xi",
    "\u039e" => "Xi",
    "\u03c0" => "pi",
    "\u03a0" => "Pi",
    "\u03c1" => "rho",
    "\u03c2" => "sigma", # final sigma; AsciiMath has no separate name for it
    "\u03c3" => "sigma",
    "\u03a3" => "Sigma",
    "\u03c4" => "tau",
    "\u03c5" => "upsilon",
    "\u03c6" => "phi",
    "\u03a6" => "Phi",
    "\u03d5" => "varphi",
    "\u03c7" => "chi",
    "\u03c8" => "psi",
    "\u03a8" => "Psi",
    "\u03c9" => "omega",
    "\u03a9" => "Omega",
    # Operators
    "\u22c5" => "*",
    "\u2219" => "*",
    "\u00b7" => "*",
    "\u2217" => "**",
    "\u22c6" => "***",
    "/" => "//",
    "\\" => "\\\\", # one backslash in, two backslashes out
    "\u00d7" => "xx",
    "\u22c9" => "|><",
    "\u22ca" => "><|",
    "\u22c8" => "|><|",
    "\u00f7" => "-:",
    "\u2218" => "@",
    "\u2295" => "o+",
    "\u2a01" => "o+",
    "\u2297" => "ox",
    "\u2299" => "o.",
    "\u2211" => "sum",
    "\u220f" => "prod",
    "\u2227" => "^^",
    "\u22c0" => "^^^",
    "\u2228" => "vv",
    "\u22c1" => "vvv",
    "\u2229" => "nn",
    "\u22c2" => "nnn",
    "\u222a" => "uu",
    "\u22c3" => "uuu",
    # Relations
    "\u2260" => "!=",
    "\u2264" => "<=",
    "\u2265" => ">=",
    "\u227a" => "-<",
    "\u227b" => ">-",
    "\u2aaf" => "-<=",
    "\u2ab0" => ">-=",
    "\u2208" => "in",
    "\u2209" => "!in",
    "\u2282" => "sub",
    "\u2283" => "sup",
    "\u2286" => "sube",
    "\u2287" => "supe",
    "\u2261" => "-=",
    "\u2245" => "~=",
    "\u2248" => "~~",
    "\u221d" => "prop",
    # Logic
    "\u00ac" => "not",
    "\u21d2" => "=>",
    "\u21d4" => "<=>",
    "\u2200" => "AA",
    "\u2203" => "EE",
    "\u22a5" => "_|_",
    "\u22a4" => "TT",
    "\u22a2" => "|--",
    "\u22a8" => "|==",
    # Angle brackets
    "\u2329" => "(:",
    "\u232a" => ":)",
    "\u27e8" => "<<",
    "\u27e9" => ">>",
    # Miscellaneous symbols
    "\u222e" => "oint",
    "\u2202" => "del",
    "\u2207" => "grad",
    "\u00b1" => "+-",
    "\u2205" => "O/",
    "\u221e" => "oo",
    "\u2135" => "aleph",
    "\u2234" => ":.",
    "\u2235" => ":'",
    "\u2220" => "/_",
    "\u25b3" => "/_\\",
    "\u2032" => "'",
    "~" => "tilde",
    # Runs of no-break spaces; longer runs are matched first
    "\u00a0\u00a0\u00a0\u00a0" => "qquad",
    "\u00a0\u00a0" => "quad",
    "\u00a0" => "\\ ",
    "\u2322" => "frown",
    "\u22ef" => "cdots",
    "\u22ee" => "vdots",
    "\u22f1" => "ddots",
    "\u22c4" => "diamond",
    "\u25a1" => "square",
    "\u230a" => "|__",
    "\u230b" => "__|",
    "\u2308" => "|~",
    "\u2309" => "~|",
    "\u2102" => "CC",
    "\u2115" => "NN",
    "\u211a" => "QQ",
    "\u211d" => "RR",
    "\u2124" => "ZZ",
    # Arrows
    "\u2191" => "uarr",
    "\u2193" => "darr",
    "\u2190" => "larr",
    "\u2194" => "harr",
    "\u21d0" => "lArr",
    "\u2192" => "->",
    "\u21a3" => ">->",
    "\u21a0" => "->>",
    "\u2916" => ">->>",
    "\u21a6" => "|->",
    # Punctuation and invisible characters
    "\u2026" => "...",
    "\u2212" => "-",
    "\u2061" => "", # function application
    "\u2751" => "square",
  }.freeze

  # Alternation of every SYMBOLS key, longest key first so that multi-character
  # keys win over their single-character prefixes.
  SYMBOL_PATTERN = Regexp.union(SYMBOLS.keys.sort_by { |key| -key.length }).freeze

  # Parent elements inside which an mrow is wrapped in parentheses.
  GROUPING_PARENTS = %w[mfrac msub munder munderover].freeze

  # munder second-child text => AsciiMath prefix accent.
  UNDER_ACCENTS = {
    "\u0332" => "ul",
    "\u23df" => "ubrace",
  }.freeze

  # mover second-child text (after encodechars) => AsciiMath prefix accent.
  OVER_ACCENTS = {
    "\u005e" => "hat",
    "\u00af" => "bar",
    "->" => "vec", # U+2192 has already been rewritten to "->" by encodechars
    "." => "dot",
    ".." => "ddot",
    "\u23de" => "obrace",
  }.freeze

  private_constant :SYMBOLS, :SYMBOL_PATTERN, :GROUPING_PARENTS,
                   :UNDER_ACCENTS, :OVER_ACCENTS

  # Converts a MathML document to an AsciiMath string.
  #
  # @param x [String] MathML source, handed to Nokogiri::XML
  # @return [String] AsciiMath text with leading/trailing whitespace removed
  #   and each pair of consecutive spaces replaced by one space (single pass);
  #   "" when the input has no root element
  def self.m2a(x)
    root = Nokogiri::XML(x).root
    # Empty or unparseable input yields a document without a root element.
    return "" if root.nil?

    parse(root).gsub(/  /, " ").
      sub(/^\s+/, "").
      sub(/\s+$/, "")
  end

  # Replaces Unicode mathematical symbols with their AsciiMath spellings.
  # Characters that are not in the symbol table are passed through unchanged.
  #
  # @param x [String] text to translate
  # @return [String] a new string with every known symbol replaced
  def self.encodechars(x)
    # The Hash form of gsub inserts values verbatim (no backslash processing).
    x.gsub(SYMBOL_PATTERN, SYMBOLS)
  end

  # Recursively renders a parsed MathML node as AsciiMath.
  #
  # Element names are matched without their namespace prefix. Elements that
  # are not recognised are returned as their XML serialisation.
  #
  # @param node [Nokogiri::XML::Node] text node or element to render
  # @return [String] AsciiMath fragment for the node
  def self.parse(node)
    return encodechars(HTMLEntities.new.decode(node.text)) if node.text?

    case local_name(node)
    when "math", "semantics"
      parsed_join(node.elements, " ")

    when "annotation"
      ""

    when "mrow"
      row = parsed_join(node.children, " ")
      GROUPING_PARENTS.include?(local_name(node.parent)) ? "(#{row})" : row

    when "mfenced"
      opener = node["open"] || "("
      closer = node["close"] || ")"
      # TODO: the separators supplied on the element are currently ignored
      "#{opener}#{parsed_join(node.elements, ",")}#{closer}"

    when "msqrt"
      "sqrt(#{parsed_join(node.elements, " ")})"

    when "mfrac"
      "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"

    when "msup"
      "#{script_base(node)}^#{script_arg(node.elements[1])}"

    when "msub"
      "#{script_base(node)}_#{script_arg(node.elements[1])}"

    when "munderover", "msubsup"
      "#{script_base(node)}_#{script_arg(node.elements[1])}^#{script_arg(node.elements[2])}"

    when "munder"
      accented(node, UNDER_ACCENTS, "underset")

    when "mover"
      accented(node, OVER_ACCENTS, "overset")

    when "mtable", "mtr"
      "[#{parsed_join(node.elements, ",")}]"

    when "mtd"
      parsed_join(node.elements, ",")

    when "mn", "mtext"
      parsed_join(node.children, "")

    when "mi"
      # mi is not meant to have space around it, but Word is conflating
      # operators and operands
      ident = parsed_join(node.children, " ")
      /[^a-zA-Z0-9',]|[a-z][a-z]/.match(ident) ? " #{ident} " : ident

    when "mo"
      oper = parsed_join(node.children, " ")
      node["fence"] ? oper : " #{oper} "

    when "mstyle"
      parsed_join(node.children, " ")

    else
      node.to_xml
    end
  end

  # Node name with any "prefix:" namespace qualifier removed.
  def self.local_name(node)
    node.name.sub(/^[^:]*:/, "")
  end

  # Parses each node, strips the result, and joins the pieces with separator.
  def self.parsed_join(nodes, separator)
    nodes.map { |child| parse(child).strip }.join(separator)
  end

  # Rendering of a script element's first child with one trailing space removed.
  def self.script_base(node)
    parse(node.elements[0]).gsub(/ $/, "")
  end

  # Rendering of a sub/superscript, parenthesised unless exactly one character.
  def self.script_arg(node)
    arg = parse(node)
    arg.length == 1 ? arg : "(#{arg})"
  end

  # Renders munder/mover: a known accent becomes "accent base", anything else
  # becomes "fallback(mark)(base)".
  def self.accented(node, accents, fallback)
    mark = parse(node.elements[1]).sub(/^\s+/, "").sub(/\s+$/, "")
    base = parse(node.elements[0])
    accent = accents[mark]
    accent ? "#{accent} #{base}" : "#{fallback}(#{mark})(#{base})"
  end

  private_class_method :local_name, :parsed_join, :script_base,
                       :script_arg, :accented
end