lib/html2doc/math.rb in html2doc-1.1.1 vs lib/html2doc/math.rb in html2doc-1.1.2

- old
+ new

@@ -23,17 +23,22 @@ def self.asciimath_to_mathml(doc, delims) return doc if delims.nil? || delims.size < 2 m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/) m.each_slice(4).map.with_index do |(*a), i| - i % 500 == 0 && m.size > 1000 && i > 0 and - warn "MathML #{i} of #{(m.size / 4).floor}" + progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath") a[2].nil? || a[2] = asciimath_to_mathml1(a[2]) a.size > 1 ? a[0] + a[2] : a[0] end.join end + def self.progress_conv(idx, step, total, threshold, msg) + return unless (idx % step).zero? && total > threshold && idx.positive? + + warn "#{msg} #{idx} of #{total}" + end + def self.unwrap_accents(doc) doc.xpath("//*[@accent = 'true']").each do |x| x.elements.length > 1 or next x.elements[1].name == "mrow" and x.elements[1].replace(x.elements[1].children) @@ -67,22 +72,24 @@ x.children = x.children.to_xml.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;") end math end + HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze + def self.unitalic(math) math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x| - x.wrap("<span style='font-style:normal;'></span>") + x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>") end math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x| - x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>") + x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>") end math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x| - x.wrap("<span class='nostem'><em></em></span>") + x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>") end math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x| - x.wrap("<span style='font-style:normal;font-weight:bold;'></span>") + x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>") end math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 
'monospace']]").each do |x| to_plane1(x, :monospace) end math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x| @@ -126,24 +133,34 @@ def self.mathml_to_ooml(docxml) docnamespaces = docxml.collect_namespaces m = docxml.xpath("//*[local-name() = 'math']") m.each_with_index do |x, i| - i % 100 == 0 && m.size > 500 && i > 0 and - warn "Math OOXML #{i} of #{m.size}" - element = ooxml_cleanup(x, docnamespaces) - doc = Nokogiri::XML::Document::new - doc.root = element - ooxml = unitalic(esc_space(@xsltemplate.transform(doc))).to_s - .gsub(/<\?[^>]+>\s*/, "") - .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "") - .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2") - ooxml = uncenter(x, ooxml) - x.swap(ooxml) + progress_conv(i, 100, m.size, 500, "Math OOXML") + mathml_to_ooml1(x, docnamespaces) end end + # We need span and em not to be namespaced. Word can't deal with explicit + # namespaces. + # We will end up stripping them out again under Nokogiri 1.11, which correctly + # insists on inheriting namespace from parent. + def self.ooml_clean(xml) + xml.to_s + .gsub(/<\?[^>]+>\s*/, "") + .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "") + .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2") + end + + def self.mathml_to_ooml1(xml, docnamespaces) + doc = Nokogiri::XML::Document::new + doc.root = ooxml_cleanup(xml, docnamespaces) + ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc)))) + ooxml = uncenter(xml, ooxml) + xml.swap(ooxml) + end + # escape space as &#x32;; we are removing any spaces generated by # XML indentation def self.esc_space(xml) xml.traverse do |n| next unless n.text? 
@@ -155,10 +172,10 @@ # if ooml has no siblings, by default it is centered; override this with # left/right if parent is so tagged def self.uncenter(math, ooxml) alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\ - "local-name() = 'div' or local-name() = 'td']/@style") + "local-name() = 'div' or local-name() = 'td']/@style") return ooxml unless alignnode && (math.next == nil && math.previous == nil) %w(left right).each do |dir| if alignnode.text.include? ("text-align:#{dir}") ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\