# Html2Doc: a module whose helpers are all defined as `self.` methods.
# Judging by the names used here (msword_fix, mathml_to_ooml, wordstyle.css)
# it prepares HTML for consumption by Microsoft Word.
#
# Visible flow of Html2Doc.process(result, hash):
#   1. create_dir uses hash[:dir] when given, otherwise "#{filename}_files"
#      (created if absent); either way clear_dir then calls File.delete on
#      every entry of that directory other than "." and "..", and the
#      directory name is stored in hash[:dir1].
#   2. process_html chains asciimath_to_mathml -> to_xhtml -> cleanup ->
#      define_head -> from_xhtml -> msword_fix and returns the resulting text.
#   3. process_header returns early when hash[:header_file] is nil; otherwise
#      it passes the header through header_image_cleanup and targets
#      "#{hash[:dir1]}/header.html".
#   4. generate_filelist and mime_package are invoked with the filename and
#      hash[:dir1]; neither is defined in this excerpt.
#   5. Unless hash[:debug] is set, rm_temp_files removes "#{filename}.htm",
#      "#{dir1}/header.html" (rm_f, so a missing file is tolerated) and, only
#      when no hash[:dir] was supplied, the whole dir1 tree.
#
# cleanup applies, in order: namespace, image_cleanup, mathml_to_ooml, lists,
# footnotes, bookmarks, msonormal, and returns the same docxml object.
#
# NOTE(review): this text appears to have lost content in extraction - confirm
# against the upstream file before relying on it:
#   * process / process_header: the call that receives
#     ("....htm" | "header.html", "w:UTF-8") { |f| f.write(...) } has no
#     receiver or method name (presumably a File.open - TODO confirm).
#   * process_header / stylesheet: `doc =, encoding: ...` and
#     `stylesheet =, encoding: ...` are missing the expression that reads
#     the file.
#   * NOKOHEAD, DOCTYPE and PRINT_VIEW heredocs have empty bodies.
#   * to_xhtml and msword_fix contain empty or truncated regex literals.
#   * The excerpt stops inside a string literal in self.stylesheet.
# NOTE(review): create_dir calls File.exists?, which was removed in Ruby 3.2;
# File.exist? is the supported spelling.
require "uuidtools" require "asciimath" require "htmlentities" require "nokogiri" require "fileutils" module Html2Doc def self.process(result, hash) hash[:dir1] = create_dir(hash[:filename], hash[:dir]) result = process_html(result, hash) process_header(hash[:header_file], hash) generate_filelist(hash[:filename], hash[:dir1])"#{hash[:filename]}.htm", "w:UTF-8") { |f| f.write(result) } mime_package result, hash[:filename], hash[:dir1] rm_temp_files(hash[:filename], hash[:dir], hash[:dir1]) unless hash[:debug] end def self.process_header(headerfile, hash) return if headerfile.nil? doc =, encoding: "utf-8") doc = header_image_cleanup(doc, hash[:dir1], hash[:filename], File.dirname(hash[:filename]))"#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) } end def self.clear_dir(dir) Dir.foreach(dir) do |f| fn = File.join(dir, f) File.delete(fn) if f != "." && f != ".." end dir end def self.create_dir(filename, dir) dir and return clear_dir(dir) dir = "#{filename}_files" Dir.mkdir(dir) unless File.exists?(dir) clear_dir(dir) end def self.process_html(result, hash) docxml = to_xhtml(asciimath_to_mathml(result, hash[:asciimathdelims])) define_head(cleanup(docxml, hash), hash) msword_fix(from_xhtml(docxml)) end def self.rm_temp_files(filename, dir, dir1) FileUtils.rm "#{filename}.htm" FileUtils.rm_f "#{dir1}/header.html" FileUtils.rm_r dir1 unless dir end def self.cleanup(docxml, hash) namespace(docxml.root) image_cleanup(docxml, hash[:dir1], File.dirname(hash[:filename])) mathml_to_ooml(docxml) lists(docxml, hash[:liststyles]) footnotes(docxml) bookmarks(docxml) msonormal(docxml) docxml end NOKOHEAD = <<~HERE.freeze HERE def self.to_xhtml(xml) xml.gsub!(/<\?xml[^>]*>/, "") unless /' + xml end Nokogiri::XML.parse(xml) end DOCTYPE = <<~"DOCTYPE".freeze DOCTYPE def self.from_xhtml(xml) xml.to_xml.sub(%{ xmlns=""}, "") .sub(DOCTYPE, "") .gsub(%{ />}, "/>") end def self.msword_fix(doc) # brain damage in MSWord parser doc.gsub!(%r{}, '') doc.gsub!(%r{
}, '
') doc.gsub!(%r{(") doc.gsub!(%r{}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{&tab;|&tab;}, '  ') doc.split(%r{(|)}).each_slice(4).map do |a| a.size > 2 and a[2] = a[2].gsub(/>\s+<") a end.join end PRINT_VIEW = <<~XML.freeze XML def self.define_head1(docxml, dir) docxml.xpath("//*[local-name() = 'head']").each do |h| h.children.first.add_previous_sibling <<~XML #{PRINT_VIEW} XML end end def self.filename_substitute(head, header_filename) return if header_filename.nil? head.xpath(".//*[local-name() = 'style']").each do |s| s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m| /FILENAME/.match?(m) ? "url(cid:header.html)" : m end s.replace(s1) end end def self.stylesheet(filename, header_filename, fn) (fn.nil? || fn.empty?) and fn = File.join(File.dirname(__FILE__), "wordstyle.css") stylesheet =, encoding: "UTF-8") xml = Nokogiri::XML("