require "uuidtools"
require "plurimath"
require "htmlentities"
require "nokogiri"
require "plane1converter"
require "metanorma-utils"
module Nokogiri
  module XML
    # Reopens Nokogiri's node class to add an XPath helper bound to the
    # Office Math (OMML) namespace.
    class Node
      OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze

      # Evaluates +path+ against this node with the prefix "m" bound to
      # OOXML_NS. The path is first passed through Metanorma::Utils.ns
      # (presumably it prefixes element names with "xmlns:" -- confirm against
      # metanorma-utils), and every "xmlns:" is then rewritten to "m:".
      def ooxml_xpath(path)
        prefixed_path = Metanorma::Utils.ns(path).gsub("xmlns:", "m:")
        xpath(prefixed_path, "m" => OOXML_NS)
      end
    end
  end
end
class Html2Doc
# Emits a progress line "<msg> <idx> of <total>" through Kernel#warn.
# Nothing is emitted unless idx is a positive multiple of step and total
# exceeds threshold. Always returns nil.
def progress_conv(idx, step, total, threshold, msg)
  if (idx % step).zero? && total > threshold && idx.positive?
    warn "#{msg} #{idx} of #{total}"
  end
end
# For every element carrying accent="true" that has at least two element
# children: if the second child is an <mrow>, replace that <mrow> with its
# own children. Returns doc.
def unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next if accented.elements.length <= 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end
MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze

# Assorted adjustments to the MathML input that OOXML needs in order to
# render properly. Applied in order: row insertion, space preservation,
# accent unwrapping; finally the default namespace of +math+ is set to
# MATHML_NS. Returns +math+.
# (encode_math is not part of the pipeline; its call was commented out.)
def ooxml_cleanup(math, docnamespaces)
  with_rows = mathml_insert_rows(math, docnamespaces)
  with_spaces = mathml_preserve_space(with_rows, docnamespaces)
  unwrap_accents(with_spaces)
  math.add_namespace(nil, MATHML_NS)
  math
end
# Walks +elem+ and replaces every non-blank text node with the result of
# @c.encode(text, :hexadecimal). @c is set elsewhere (presumably an
# HTMLEntities coder -- confirm). Returns +elem+.
def encode_math(elem)
  elem.traverse do |node|
    next unless node.text?
    next if node.text.strip.empty?

    node.replace(@c.encode(node.text, :hexadecimal))
  end
  elem
end
# For every script/limit element (msup, msub, msubsup, munder, mover,
# munderover) below +math+, wraps the element that immediately follows it in
# an <mrow>, unless that following element already is an <mrow>.
# +docnamespaces+ supplies the binding for the "xmlns" prefix used in the
# XPath. Returns +math+.
def mathml_insert_rows(math, docnamespaces)
  script_paths = %w(msup msub msubsup munder mover munderover)
    .map { |m| ".//xmlns:#{m}" }.join(" | ")
  math.xpath(script_paths, docnamespaces).each do |x|
    following = x.next_element
    # Compare the element *name*: comparing the node itself with a String is
    # always unequal, which re-wrapped elements that were already <mrow>.
    next if following.nil? || following.name == "mrow"

    following.wrap("<mrow/>")
  end
  math
end
# Protects leading and trailing whitespace inside every <mtext> below +math+
# by turning it into a no-break space, written as the character reference
# &#xA0; so the replacement is visible in source and survives editors.
# The serialized children are re-assigned as markup, so the reference is
# resolved when the markup is parsed. ^ and $ anchor at line boundaries, so a
# multi-line serialization is treated line by line. Returns +math+.
def mathml_preserve_space(math, docnamespaces)
  math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
    x.children = x.children.to_xml
      .gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
  end
  math
end
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze

# Wraps each non-blank text node found while traversing +elem+ in the markup
# given by +wrapper+. Returns whatever elem.traverse returns.
def wrap_text(elem, wrapper)
  elem.traverse do |node|
    node.wrap(wrapper) if node.text? && !node.text.strip.empty?
  end
end
# Post-processes OMML runs (m:r) according to their style (m:sty) and script
# (m:scr) properties:
# * runs with a style but no script get their text wrapped via wrap_text;
# * runs with a script get their text converted via to_plane1 to the
#   matching font variant.
# The rules are applied strictly in the order listed. Returns +math+.
#
# NOTE(review): every wrapper markup string below is empty, so the four
# style rules currently pass identical (empty) markup to wrap_text, and
# HTML_NS is not used anywhere in view -- confirm the intended wrapper
# markup for each style.
def unitalic(math)
  styled_runs = [
    [".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]", ""],
    [".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]", ""],
    [".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]", ""],
    [".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]", ""],
  ]
  styled_runs.each do |path, wrapper|
    math.ooxml_xpath(path).each { |run| wrap_text(run, wrapper) }
  end

  scripted_runs = [
    [".//r[rPr/scr[@m:val = 'monospace']]", :monospace],
    [".//r[rPr/scr[@m:val = 'double-struck']]", :doublestruck],
    [".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]",
     :script],
    [".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]", :scriptbold],
    [".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]",
     :fraktur],
    [".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]", :frakturbold],
    [".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]",
     :sans],
    [".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]", :sansbold],
    [".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]", :sansitalic],
    [".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]",
     :sansbolditalic],
  ]
  scripted_runs.each do |path, font|
    math.ooxml_xpath(path).each { |run| to_plane1(run, font) }
  end

  math
end
# Replaces every text node under +xml+ with
# Plane1Converter.conv(decoded_text, font), where the text is first decoded
# through @c (set elsewhere; presumably an HTMLEntities coder -- confirm).
# Returns +xml+.
def to_plane1(xml, font)
  xml.traverse do |node|
    node.text? and
      node.replace(Plane1Converter.conv(@c.decode(node.text), font))
  end
  xml
end
# Converts every element whose local name is "math" in +docxml+ through
# mathml_to_ooml1, passing along the namespaces collected from the document.
# Progress is reported via progress_conv (step 100, threshold 500).
# Returns the collection of math elements that was iterated.
def mathml_to_ooml(docxml)
  namespaces = docxml.collect_namespaces
  formulas = docxml.xpath("//*[local-name() = 'math']")
  formulas.each_with_index do |formula, position|
    progress_conv(position, 100, formulas.size, 500, "Math OOXML")
    mathml_to_ooml1(formula, namespaces)
  end
end
# Serializes +xml+ without indentation, then removes "<?...?>" constructs
# (with any whitespace following them) and every xmlns / xmlns:prefix
# declaration.
# span and em must not be namespaced: Word cannot deal with explicit
# namespaces. They end up being stripped out again under Nokogiri 1.11,
# which correctly insists on inheriting the namespace from the parent.
def ooml_clean(xml)
  serialized = xml.to_xml(indent: 0)
  without_declarations = serialized.gsub(/<\?[^>]+>\s*/, "")
  without_declarations.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
  # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
end
# Converts one MathML element to OMML and swaps it into the document in
# place of the original:
# 1. clean the MathML up (ooxml_cleanup) and make it the root of a fresh
#    document;
# 2. serialize that and let Plurimath turn it into OMML;
# 3. fix accents and font styling (accent_tr, unitalic);
# 4. adjust alignment (uncenter), strip namespaces (ooml_clean), and swap
#    the result in for +xml+.
def mathml_to_ooml1(xml, docnamespaces)
  holder = Nokogiri::XML::Document.new
  holder.root = ooxml_cleanup(xml, docnamespaces)
  # d = xml.parent["block"] != "false" # display_style
  mathml_source = holder.to_xml(indent: 0)
  omml_source = Plurimath::Math.parse(mathml_source, :mathml)
    .to_omml(split_on_linebreak: true)
  converted = unitalic(accent_tr(Nokogiri::XML(omml_source)))
  xml.swap(ooml_clean(uncenter(xml, converted)))
end
# Rewrites the accent character on every accPr/chr element of +xml+: the
# "m:val" and "val" attributes, when present, are each passed through
# accent_tr1. Returns +xml+.
def accent_tr(xml)
  xml.ooxml_xpath(".//accPr/chr").each do |chr|
    %w(m:val val).each do |attr|
      chr[attr] and chr[attr] = accent_tr1(chr[attr])
    end
  end
  xml
end
# Maps a spacing accent character to its combining counterpart:
# U+2192 -> U+20D7, "^" -> U+0302, "~" -> U+0303.
# Any other value is returned unchanged.
def accent_tr1(accent)
  combining = {
    "\u2192" => "\u20D7",
    "^" => "\u0302",
    "~" => "\u0303",
  }
  combining.fetch(accent, accent)
end
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze

# True when +node+ contains no text other than mathematics. Works on a copy
# of the node: MathML <math> elements and OMML oMathPara / oMath elements are
# removed, and whatever text remains must be blank.
def math_only_para?(node)
  remainder = node.dup
  discard = ->(found) { found.each(&:remove) }
  discard.call(remainder.xpath(".//m:math", "m" => MATHML_NS))
  discard.call(remainder.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS))
  # namespace can go missing during processing, so query once more without
  # an explicit binding for "m"
  discard.call(remainder.xpath(".//m:oMathPara | .//m:oMath"))
  remainder.text.strip.empty?
end
# True when +mathml+ has displaystyle="true" and +ooxml+ has at most one
# direct m:oMath child.
def math_block?(ooxml, mathml)
  # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
  return false unless mathml["displaystyle"] == "true"

  ooxml.xpath("./m:oMath", "m" => OOXML_NS).size <= 1
end
STYLE_BEARING_NODE =
  %w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze

# If the OOXML math has no siblings, it is centered by default; override
# that with left/right justification taken from the nearest style-bearing
# ancestor (p, div, td, th, li) of the source MathML.
# No override is applied when uncenter_unneeded returns a result; that
# result is returned as is.
#
# math::  the source MathML element, still attached to its document
# ooxml:: the converted OMML, either a document or its root element
#
# Returns the OMML root with an m:oMathParaPr/m:jc justification inserted
# before its first element child, or the value from uncenter_unneeded.
def uncenter(math, ooxml)
  alignnode = math.xpath(STYLE_BEARING_NODE).last
  ooxml.document? and ooxml = ooxml.root
  ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
  # alignnode is non-nil from here on: uncenter_unneeded returns the OMML
  # itself whenever there is no style-bearing ancestor.
  dir = alignnode["style"]&.include?("text-align:right") ? "right" : "left"
  ooxml.name == "oMathPara" or
    ooxml.wrap("<m:oMathPara></m:oMathPara>")
  # The justification has to carry dir; previously dir was computed and
  # then dropped because the inserted markup was empty.
  ooxml.elements.first.previous =
    "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
  ooxml
end
# Decides whether uncenter can skip adding an explicit justification.
#
# Returns
# * +ooxml+ unchanged when the math is a display block (math_block?) or
#   there is no style-bearing ancestor (+alignnode+ is nil);
# * nil (meaning: justification is needed) when +alignnode+ contains nothing
#   but math, or when the OMML holds more than one oMath / r element;
# * otherwise a NodeSet holding the single math element, i.e. the content
#   of an oMathPara wrapper, or +ooxml+ itself when it is not an oMathPara.
def uncenter_unneeded(math, ooxml, alignnode)
  (math_block?(ooxml, math) || !alignnode) and return ooxml
  math_only_para?(alignnode) and return nil
  # Always work with a list: a root that is not oMathPara used to be left as
  # a single node, on which #size was then called.
  candidates =
    if ooxml.name == "oMathPara"
      ooxml.elements.select { |x| %w(oMath r).include?(x.name) }
    else
      [ooxml]
    end
  candidates.size > 1 and return nil
  Nokogiri::XML::NodeSet.new(math.document, candidates)
end
end