require "uuidtools"
require "asciimath"
require "htmlentities"
require "nokogiri"
require "xml/xslt"
require "pp"
require "fileutils"
module Html2Doc
# Runs the whole conversion for one document.
#
# In order: resolves the working directory (stored back into hash[:dir1]),
# transforms the HTML, writes the header file when one is configured, calls
# generate_filelist, writes the transformed HTML to "<filename>.htm", calls
# mime_package, and finally removes the temporary files.
#
# @param result the source HTML (presumably a String; confirm with callers)
# @param hash [Hash] options; :filename, :dir and :header_file are read here,
#   and :dir1 is written
# @return whatever rm_temp_files returns
def self.process(result, hash)
  filename = hash[:filename]
  workdir = hash[:dir1] = create_dir(filename, hash[:dir])
  html = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(filename, workdir)
  File.open("#{filename}.htm", "w:UTF-8") do |out|
    out.write(html)
  end
  mime_package(html, filename, workdir)
  rm_temp_files(filename, hash[:dir], workdir)
end
# Writes a cleaned copy of the header file to "<hash[:dir1]>/header.html".
#
# Does nothing (and returns nil) when no header file is given. Otherwise the
# file is read as UTF-8, passed through header_image_cleanup, and written
# out as UTF-8.
#
# @param headerfile [String, nil] path of the header file
# @param hash [Hash] options; :dir1 and :filename are read
def self.process_header(headerfile, hash)
  unless headerfile.nil?
    raw = File.read(headerfile, encoding: "utf-8")
    cleaned = header_image_cleanup(raw, hash[:dir1], hash[:filename])
    File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |out| out.write(cleaned) }
  end
end
# Resolves the directory that holds the document's auxiliary files.
#
# A caller-supplied directory is returned untouched (it is not created
# here). Otherwise "<filename>_files" is used, and created if nothing
# exists at that path yet.
#
# @param filename [String] base name of the output document
# @param dir [String, nil] caller-supplied directory, if any
# @return [String] the directory to use
def self.create_dir(filename, dir)
  return dir if dir

  dir = "#{filename}_files"
  # File.exists? was deprecated and then removed in Ruby 3.2;
  # File.exist? is the supported spelling on every Ruby version.
  Dir.mkdir(dir) unless File.exist?(dir)
  dir
end
# Transforms the source HTML into the markup that is written to the .htm file.
#
# The input goes through asciimath_to_mathml and to_xhtml, the resulting
# document is passed to cleanup and then define_head, and the document is
# finally serialized with from_xhtml and post-processed by msword_fix.
#
# @param result the source HTML
# @param hash [Hash] options; :asciimathdelims is read here, the rest is
#   forwarded to cleanup and define_head
# @return the value returned by msword_fix
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  serialized = from_xhtml(docxml)
  msword_fix(serialized)
end
# Removes the intermediate files produced during conversion.
#
# Always deletes "<filename>.htm" (raising if it is missing) and deletes
# "<dir1>/header.html" if present. The working directory itself is removed
# recursively only when the caller did not supply a directory.
#
# @param filename [String] base name of the output document
# @param dir [String, nil] caller-supplied directory, if any
# @param dir1 [String] working directory actually used
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm("#{filename}.htm")
  FileUtils.rm_f("#{dir1}/header.html")
  return if dir

  FileUtils.rm_r(dir1)
end
# Applies the document clean-up passes, in a fixed order, to docxml.
#
# The passes are image_cleanup, mathml_to_ooml, lists, footnotes, bookmarks
# and msonormal; each receives the same document object.
#
# @param docxml the parsed document
# @param hash [Hash] options; :dir1 and :liststyles are read
# @return the docxml object that was passed in
def self.cleanup(docxml, hash)
  docxml.tap do |doc|
    image_cleanup(doc, hash[:dir1])
    mathml_to_ooml(doc)
    lists(doc, hash[:liststyles])
    footnotes(doc)
    bookmarks(doc)
    msonormal(doc)
  end
end
# Frozen heredoc constant; not referenced by any method in the lines visible here.
# NOTE(review): the heredoc body is empty, so this evaluates to an empty
# string. The content looks like it was lost; confirm against the
# upstream source before relying on it.
NOKOHEAD = <<~HERE.freeze
HERE
# Parses a markup string into a Nokogiri XML document.
#
# Any `<?xml ...>` declarations are first deleted from the string in place
# (the argument is mutated by gsub!).
#
# NOTE(review): the `unless` line below is not valid Ruby as it stands. The
# regex literal is never closed and the branch has no body, so it looks like
# markup inside the original literals was stripped from this file.
# Presumably it prepended something to `xml` when a pattern was absent;
# restore it from the upstream source rather than guessing.
#
# @param xml [String] markup to parse; mutated in place
# @return the result of Nokogiri::XML.parse
def self.to_xhtml(xml)
xml.gsub!(/<\?xml[^>]*>/, "")
unless /' + xml
end
Nokogiri::XML.parse(xml)
end
# Text that from_xhtml removes (first occurrence only) from the serialized
# document.
# NOTE(review): the heredoc body is empty, so this is an empty string and
# that removal is a no-op. The content looks like it was lost; confirm
# against the upstream source.
DOCTYPE = <<~"DOCTYPE".freeze
DOCTYPE
# Serializes the document back to a string and normalizes it.
#
# Three edits are applied to the output of to_xml, in order: the first
# XHTML default-namespace declaration is dropped, the first occurrence of
# DOCTYPE is dropped, and every " />" is tightened to "/>".
#
# @param xml a document responding to to_xml
# @return [String] the normalized markup
def self.from_xhtml(xml)
  serialized = xml.to_xml
  without_ns = serialized.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  without_doctype = without_ns.sub(DOCTYPE, "")
  without_doctype.gsub(%{ />}, "/>")
end
# Applies a series of in-place textual substitutions to the serialized
# markup, working around quirks of the MS Word parser, and returns the
# same string.
#
# NOTE(review): several patterns and replacements below are empty or
# unterminated (`%r{}`, `''`, `%r{(")`), and the last pattern lists the same
# alternative twice. It looks like the markup inside these literals was
# stripped from this file, so the method is not valid Ruby as shown.
# Restore the literals from the upstream source rather than guessing.
#
# @param r [String] serialized markup; mutated in place
# @return [String] r
def self.msword_fix(r)
# brain damage in MSWord parser
r.gsub!(%r{},
'')
r.gsub!(%r{},
'')
r.gsub!(%r{(")
r.gsub!(%r{}, "/>")
r.gsub!(%r{&tab;|&tab;}, ' ')
r
end
# Markup that define_head1 interpolates into the text it inserts at the
# start of each head element.
# NOTE(review): the heredoc body is empty, so this is an empty string. The
# content looks like it was lost; confirm against the upstream source.
PRINT_VIEW = <<~XML.freeze
XML
# Inserts PRINT_VIEW (followed by a newline) before the first child of every
# element whose local name is "head".
#
# NOTE(review): `dir` is not used by the body as it appears here; the
# inserted markup may originally have referenced it. Confirm upstream.
#
# @param docxml the parsed document
# @param dir working directory (currently unused)
# @return the node collection returned by the xpath query
def self.define_head1(docxml, dir)
  heads = docxml.xpath("//*[local-name() = 'head']")
  heads.each do |head|
    first_child = head.children.first
    first_child.add_previous_sibling <<~XML
      #{PRINT_VIEW}
    XML
  end
end
# Adapts the stylesheet text to the output document, editing the string in
# place.
#
# Without a header file, every line matching the FILENAME pattern below is
# collapsed to a single newline. With a header file, every occurrence of
# FILENAME is replaced by +filename+.
#
# @param stylesheet [String] CSS text; mutated in place
# @param header_filename [String, nil] header file path, if any
# @param filename [String] base name of the output document
# @return [String] the same (mutated) stylesheet object
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # NOTE(review): the literal "i" before the closing newline means only
    # lines that end in "i" match. It looks like a misplaced /i flag;
    # confirm the intent before changing the pattern.
    stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
  else
    # The block form inserts the name verbatim. Passing it as a replacement
    # string would make gsub interpret backslash sequences such as \0 or \1
    # in the name (e.g. Windows paths) as back-references.
    stylesheet.gsub!(/FILENAME/) { filename }
  end
  stylesheet
end
# Builds the stylesheet markup that gets inserted into the document head.
#
# The CSS is read as UTF-8 from +fn+, or from "wordstyle.css" next to this
# source file when +fn+ is nil or empty. It is passed through
# filename_substitute, then wrapped in an XML comment that is appended to
# the first child of a freshly parsed document.
#
# NOTE(review): the document is parsed from an empty string here, so
# `xml.children.first` would presumably be nil. It looks like the wrapper
# markup was lost from this literal; confirm against the upstream source.
#
# @param filename [String] base name of the output document
# @param header_filename [String, nil] header file path, if any
# @param fn [String, nil] stylesheet path override
# @return [String] serialized root element of the wrapper document
def self.stylesheet(filename, header_filename, fn)
  if fn.nil? || fn.empty?
    fn = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  css = File.read(fn, encoding: "UTF-8")
  css = filename_substitute(css, header_filename, filename)
  xml = Nokogiri::XML("")
  comment = Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.children.first << comment
  xml.root.to_s
end
# Prepares the document head: locates the head and title elements, inserts
# the stylesheet, lets define_head1 add its markup, and declares the
# namespaces on the root element.
#
# @param docxml the parsed document
# @param hash [Hash] options; :filename, :header_file, :stylesheet and :dir1
#   are read
# @return whatever namespace returns
def self.define_head(docxml, hash)
  head_path = "//*[local-name() = 'head']"
  title = docxml.at("#{head_path}/*[local-name() = 'title']")
  head = docxml.at(head_path)
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  define_head1(docxml, hash[:dir1])
  namespace(docxml.root)
end
# Places the stylesheet markup inside the head element.
#
# An empty head gets it as its only child. Otherwise it goes right after
# the title when there is one, or before the head's first child when there
# is not.
#
# @param head the head element
# @param title the title element, or nil
# @param css [String] stylesheet markup to insert
# @return the result of the insertion call that was made
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return title.add_next_sibling(css) unless title.nil?

  head.children.first.add_previous_sibling(css)
end
# Declares the o, w, v and m namespace prefixes on the root element, then
# sets its default namespace to the HTML 4.0 URI.
#
# @param root the document's root element
# @return whatever root.add_namespace returns
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end
# Processes every element that has an id attribute, has no name attribute,
# and is not styled as a footnote element.
#
# Elements with an empty id are skipped. For the rest, a piece of markup is
# added as the only child (when the element is empty) or placed before the
# first child, and the id attribute is then deleted.
#
# NOTE(review): the inserted markup is an empty string in both branches. It
# looks like the literal was lost (presumably an anchor carrying the id);
# confirm against the upstream source.
#
# @param docxml the parsed document
# @return the node collection returned by the xpath query
def self.bookmarks(docxml)
  query = "//*[@id][not(@name)][not(@style = 'mso-element:footnote')]"
  docxml.xpath(query).each do |node|
    next if node["id"].empty?

    kids = node.children
    if kids.empty?
      node.add_child("")
    else
      kids.first.previous = ""
    end
    node.delete("id")
  end
end
# Gives class "MsoNormal" to every p element, and then every li element,
# that has no class attribute.
#
# @param docxml the parsed document
# @return the node collection returned by the li query
def self.msonormal(docxml)
  tag_unclassed = lambda do |tag|
    unclassed = "//*[local-name() = '#{tag}'][not(self::*[@class])]"
    docxml.xpath(unclassed).each do |node|
      node["class"] = "MsoNormal"
    end
  end
  tag_unclassed.call("p")
  tag_unclassed.call("li")
end
end