lib/html2doc/mime.rb in html2doc-0.0.1 vs lib/html2doc/mime.rb in html2doc-0.5.0
- old
+ new
@@ -1,127 +1,9 @@
require "fileutils"
require "uuidtools"
require "nokogiri"
module Html2Doc
# Converts an HTML/XHTML string into a Word-readable ".htm" file and a MIME
# ".doc" package.
#
# Steps, in order: parse the markup, clean the document body, inject the
# Word-specific head content, apply the Word parser workarounds, copy the
# optional header file into +dir+ as "header.html", write the file list,
# write "<filename>.htm", and finally build the MIME package.
#
# @param result [String] source markup
# @param filename [String] output path without extension
# @param header_file [String, nil] path of a header file to copy; nil skips
#   the copy
# @param dir [String] directory that receives header.html and filelist.xml
# @return whatever mime_package returns
# @raise [SystemCallError] if the header file cannot be copied
def self.process(result, filename, header_file, dir)
  docxml = Nokogiri::XML(xhtml(result))
  cleanup(docxml, dir)
  define_head(docxml, dir, filename, header_file)
  html = msword_fix(docxml.to_xml)
  # Copy without going through a shell: paths containing spaces or shell
  # metacharacters are handled literally, and a failed copy raises instead
  # of being silently ignored.
  FileUtils.cp(header_file, File.join(dir, "header.html")) unless header_file.nil?
  # The file list is generated after the header copy so that header.html
  # is already present in dir when the directory is listed.
  generate_filelist(filename, dir)
  File.open("#{filename}.htm", "w") { |f| f.write(html) }
  mime_package html, filename, dir
end
-
# Runs the document clean-up passes in order: image handling first, then
# default paragraph/list-item classes.
#
# @param docxml parsed document, modified in place
# @param dir [String] directory handed to the image pass
# @return the value returned by msonormal
def self.cleanup(docxml, dir)
  self.image_cleanup(docxml, dir)
  self.msonormal(docxml)
end
-
# preserve HTML escapes
#
# Ensures the markup carries an HTML DOCTYPE. Input that already contains
# "<!DOCTYPE html" is returned as is; otherwise any `<?xml version="1.0"?>`
# declaration is removed and an XHTML 1.0 Strict DOCTYPE is prepended.
#
# The argument is never modified, so frozen strings are accepted.
#
# @param result [String] source markup
# @return [String] markup guaranteed to contain an HTML DOCTYPE
def self.xhtml(result)
  return result if /<!DOCTYPE html/.match?(result)

  # Non-mutating gsub: the caller's string must not lose its XML declaration
  # as a side effect of this call.
  body = result.gsub(/<\?xml version="1\.0"\?>/, "")
  "<!DOCTYPE html SYSTEM " \
    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>#{body}"
end
-
# Rewrites serialized markup into the forms the MS Word HTML parser accepts:
#
# * self-closed footnote spans and comment-reference anchors get an explicit
#   closing tag;
# * the quotes around rel="File-List" and http-equiv="Content-Type" are
#   removed;
# * each "&tab;" becomes a span styled with mso-tab-count:1.
#
# The argument is not modified; a new string is returned, so frozen input
# is accepted.
#
# @param r [String] serialized document
# @return [String] adjusted copy of +r+
def self.msword_fix(r)
  # NOTE(review): the original pattern listed "&tab;" twice in an
  # alternation; the second alternative may have been a different spelling
  # lost in transcription, and the filler whitespace inside the tab span
  # may have been a non-breaking space — confirm both against upstream.
  r.gsub(%r{<span style="mso-special-character:footnote"/>},
         '<span style="mso-special-character:footnote"></span>')
   .gsub(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
   .gsub(%r{<link rel="File-List"}, "<link rel=File-List")
   .gsub(%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type")
   .gsub(%r{&tab;}, '<span style="mso-tab-count:1">  </span>')
end
-
# Computes display dimensions for an image so that it fits within the
# limits used for a Word document: at most 400 wide and 680 high.
#
# The width limit is applied first, then the height limit; each step scales
# the other dimension proportionally and rounds it up. Images already within
# both limits keep their size.
#
# @param orig_filename [String] path handed to ImageSize.path
# @return [Array] two-element array [width, height]
def self.image_resize(orig_filename)
  max_width = 400
  max_height = 680
  width, height = ImageSize.path(orig_filename).size
  if width > max_width
    # Exact rational division: plain Integer division truncates before
    # ceil can take effect, which rounded the scaled side down.
    height = Rational(height * max_width, width).ceil
    width = max_width
  end
  if height > max_height
    width = Rational(width * max_height, height).ceil
    height = max_height
  end
  [width, height]
end
-
# Copies every image referenced by the document into +dir+ under a random
# UUID-based name, sets the element's width/height to the values returned by
# image_resize, and repoints its src at the copy.
#
# presupposes that the image source is local
#
# @param docxml parsed document, modified in place
# @param dir [String] destination directory for the copied images
# @return docxml
# @raise [SystemCallError] if an image file cannot be copied
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |img|
    source = img["src"]
    # Nothing to copy for an image element without a src attribute.
    next if source.nil?

    uuid = UUIDTools::UUID.random_create.to_s
    # File.extname keeps only the final extension (including the dot), so
    # dots in directory names cannot leak path separators into the new
    # name, and a source without an extension no longer raises.
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(source)}")
    # Copy without a shell: src comes from the input document and must not
    # be interpreted as shell syntax.
    FileUtils.cp(source, new_full_filename)
    img["width"], img["height"] = image_resize(source)
    img["src"] = new_full_filename
  end
  docxml
end
-
# Inserts the Word-specific head content at the start of every head element:
# a conditional comment holding the w:WordDocument settings (print view,
# 100% zoom, DoNotOptimizeForBrowser), a Content-Type meta element, and a
# File-List link pointing at "<dir>/filelist.xml".
#
# @param docxml parsed document, modified in place
# @param dir [String] directory containing filelist.xml
# @return the value returned by iterating the matched head elements
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    # prepend_child also works when the head element has no children,
    # where looking up the first child would yield nil.
    h.prepend_child <<~XML
      <!--[if gte mso 9]>
      <xml>
      <w:WordDocument>
      <w:View>Print</w:View>
      <w:Zoom>100</w:Zoom>
      <w:DoNotOptimizeForBrowser/>
      </w:WordDocument>
      </xml>
      <![endif]-->
      <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
  end
end
-
# Builds a <style> element whose content is the bundled "wordstyle.css"
# (read from this file's directory) wrapped in an XML comment.
#
# When a header file is given, every FILENAME placeholder in the CSS is
# replaced with +filename+; otherwise the lines matching the FILENAME
# pattern below are removed.
#
# @param filename [String] substituted for FILENAME in the stylesheet
# @param header_filename [String, nil] nil selects the removal branch
# @return [String] serialized <style> element
def self.stylesheet(filename, header_filename)
  css_path = File.join(File.dirname(__FILE__), "wordstyle.css")
  css = File.read(css_path, encoding: "UTF-8")
  css =
    if header_filename.nil?
      # NOTE(review): the pattern ends in a literal "i" before the newline,
      # so only FILENAME lines ending in "i" are removed — possibly a
      # misplaced /i flag; confirm against wordstyle.css.
      css.gsub(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
    else
      css.gsub(/FILENAME/, filename)
    end
  wrapper = Nokogiri::XML("<style/>")
  wrapper.children.first << Nokogiri::XML::Comment.new(wrapper, "\n#{css}\n")
  wrapper.root.to_s
end
-
# Populates the document head for Word: places the stylesheet right after
# the title element (or as the first child of head when there is no title),
# then adds the Word-specific head content and the namespace declarations
# on the root element.
#
# @param docxml parsed document, modified in place
# @param dir [String] directory containing filelist.xml
# @param filename [String] passed through to the stylesheet
# @param header_file [String, nil] passed through to the stylesheet
# @return the value returned by namespace
def self.define_head(docxml, dir, filename, header_file)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  style = stylesheet(filename, header_file)
  if title.nil?
    # prepend_child handles a head element with no children, where
    # fetching the first child to insert before it would yield nil.
    head.prepend_child(style)
  else
    title.add_next_sibling(style)
  end
  define_head1(docxml, dir)
  namespace(docxml.root)
end
-
# Declares the Office namespaces on the root element: prefixes "o", "w" and
# "m" through add_namespace_definition, then the default (prefix-less)
# namespace through add_namespace.
#
# @param root root element of the document
# @return the value returned by the final add_namespace call
def self.namespace(root)
  prefixed = [
    ["o", "urn:schemas-microsoft-com:office:office"],
    ["w", "urn:schemas-microsoft-com:office:word"],
    ["m", "http://schemas.microsoft.com/office/2004/12/omml"],
  ]
  prefixed.each do |prefix, uri|
    root.add_namespace_definition(prefix, uri)
  end
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end
-
def self.mime_preamble(boundary, filename, result)
<<~"PREAMBLE"
MIME-Version: 1.0
Content-Type: multipart/related; boundary="#{boundary}"
@@ -168,30 +50,7 @@
next if item == "." || item == ".." || /^\./.match(item)
mhtml += mime_attachment(boundary, filename, item, dir)
end
mhtml += "--#{boundary}--"
File.open("#{filename}.doc", "w") { |f| f.write mhtml }
- end
-
# Writes "filelist.xml" into +dir+: an o:MainFile entry pointing at
# "../<filename>.htm" followed by one o:File entry per non-hidden entry of
# +dir+, in sorted order so the output does not depend on the order in
# which the filesystem lists the directory.
#
# The directory is listed after filelist.xml has been opened for writing,
# so the list contains filelist.xml itself.
#
# NOTE(review): entry names are written into the HRef attribute without XML
# escaping; names containing `&`, `<` or `"` would need escaping — confirm
# whether such names can occur.
#
# @param filename [String] main document path without extension
# @param dir [String] directory to list and to write filelist.xml into
# @return the value returned by the final write
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |f|
    f.write(<<~"XML")
      <xml xmlns:o="urn:schemas-microsoft-com:office:office">
      <o:MainFile HRef="../#{filename}.htm"/>
    XML
    # Skipping dot-prefixed names also covers "." and "..".
    visible = Dir.entries(dir).reject { |item| item.start_with?(".") }
    visible.sort.each do |item|
      f.write %{ <o:File HRef="#{item}"/>\n}
    end
    f.write("</xml>\n")
  end
end
-
# Gives every p and li element that has no class attribute the class
# "MsoNormal". Paragraphs are processed first, then list items.
#
# @param docxml parsed document, modified in place
# @return the result of iterating the li elements
def self.msonormal(docxml)
  passes = %w[p li].map do |tag|
    unclassed = docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
    unclassed.each { |node| node["class"] = "MsoNormal" }
  end
  passes.last
end
end