lib/html2doc/math.rb in html2doc-1.3.1 vs lib/html2doc/math.rb in html2doc-1.4.0

- old
+ new

@@ -2,83 +2,79 @@
 require "asciimath"
 require "htmlentities"
 require "nokogiri"
 require "plane1converter"
 
-module Html2Doc
-  @xsltemplate =
-    Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
-                             encoding: "utf-8"))
-
-  def self.asciimath_to_mathml1(expr)
+class Html2Doc
+  def asciimath_to_mathml1(expr)
     AsciiMath::MathMLBuilder.new(msword: true).append_expression(
       AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
     ).to_s
       .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
   rescue StandardError => e
     puts "parsing: #{expr}"
     puts e.message
     raise e
   end
 
-  def self.asciimath_to_mathml(doc, delims)
+  def asciimath_to_mathml(doc, delims)
     return doc if delims.nil? || delims.size < 2
 
     m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
     m.each_slice(4).map.with_index do |(*a), i|
       progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
       a[2].nil? || a[2] = asciimath_to_mathml1(a[2])
       a.size > 1 ? a[0] + a[2] : a[0]
     end.join
   end
 
-  def self.progress_conv(idx, step, total, threshold, msg)
+  def progress_conv(idx, step, total, threshold, msg)
     return unless (idx % step).zero? && total > threshold && idx.positive?
 
     warn "#{msg} #{idx} of #{total}"
   end
 
-  def self.unwrap_accents(doc)
+  def unwrap_accents(doc)
     doc.xpath("//*[@accent = 'true']").each do |x|
       x.elements.length > 1 or next
       x.elements[1].name == "mrow" and
         x.elements[1].replace(x.elements[1].children)
     end
     doc
   end
 
   # random fixes to MathML input that OOXML needs to render properly
-  def self.ooxml_cleanup(math, docnamespaces)
+  def ooxml_cleanup(math, docnamespaces)
     math = unwrap_accents(
       mathml_preserve_space(
         mathml_insert_rows(math, docnamespaces), docnamespaces
       ),
     )
     math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
     math
   end
 
-  def self.mathml_insert_rows(math, docnamespaces)
+  def mathml_insert_rows(math, docnamespaces)
     math.xpath(%w(msup msub msubsup munder mover munderover)
       .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
       next unless x.next_element && x.next_element != "mrow"
 
       x.next_element.wrap("<mrow/>")
     end
     math
   end
 
-  def self.mathml_preserve_space(math, docnamespaces)
+  def mathml_preserve_space(math, docnamespaces)
     math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
       x.children = x.children.to_xml.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
     end
     math
   end
 
   HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
 
-  def self.unitalic(math)
+  def unitalic(math)
     math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
       x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
     end
     math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
       x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
@@ -120,20 +116,20 @@
       to_plane1(x, :sansbolditalic)
     end
     math
   end
 
-  def self.to_plane1(xml, font)
+  def to_plane1(xml, font)
     xml.traverse do |n|
       next unless n.text?
 
       n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
     end
     xml
   end
 
-  def self.mathml_to_ooml(docxml)
+  def mathml_to_ooml(docxml)
     docnamespaces = docxml.collect_namespaces
     m = docxml.xpath("//*[local-name() = 'math']")
     m.each_with_index do |x, i|
       progress_conv(i, 100, m.size, 500, "Math OOXML")
       mathml_to_ooml1(x, docnamespaces)
@@ -142,55 +138,55 @@
   # We need span and em not to be namespaced. Word can't deal with explicit
   # namespaces.
   # We will end up stripping them out again under Nokogiri 1.11, which correctly
   # insists on inheriting namespace from parent.
-  def self.ooml_clean(xml)
+  def ooml_clean(xml)
     xml.to_s
       .gsub(/<\?[^>]+>\s*/, "")
       .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
       .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
   end
 
-  def self.mathml_to_ooml1(xml, docnamespaces)
+  def mathml_to_ooml1(xml, docnamespaces)
     doc = Nokogiri::XML::Document::new
     doc.root = ooxml_cleanup(xml, docnamespaces)
     ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
     ooxml = uncenter(xml, ooxml)
     xml.swap(ooxml)
   end
 
-  def self.accent_tr(xml)
+  def accent_tr(xml)
     xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
       x["m:val"] &&= accent_tr1(x["m:val"])
       x["val"] &&= accent_tr1(x["val"])
     end
     xml
   end
 
-  def self.accent_tr1(accent)
+  def accent_tr1(accent)
     case accent
     when "\u2192" then "\u20D7"
     when "^" then "\u0302"
     when "~" then "\u0303"
     else accent
     end
   end
 
   # escape space as &#x32;; we are removing any spaces generated by
   # XML indentation
-  def self.esc_space(xml)
+  def esc_space(xml)
     xml.traverse do |n|
       next unless n.text?
 
       n = n.text.gsub(/ /, "&#x32;")
     end
     xml
   end
 
   # if oomml has no siblings, by default it is centered; override this with
   # left/right if parent is so tagged
-  def self.uncenter(math, ooxml)
+  def uncenter(math, ooxml)
     alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
                         "local-name() = 'div' or local-name() = 'td']/@style")
     return ooxml unless alignnode && (math.next == nil && math.previous == nil)
 
     %w(left right).each do |dir|