# Specs for Html2Doc: fixture builders, expected-output fragments, and the
# examples that compare generated Word (MIME/HTML) documents against them.
#
# NOTE(review): this file appears to have been through a lossy extraction.
# Every HTML/XML literal below has lost its markup (tags, comments, entities),
# and WORD_HDR_END is referenced by the examples but is not defined anywhere
# in the file as seen. Line breaks inside the fixtures were reconstructed;
# presumably that is harmless because expectations go through match_fuzzy,
# but confirm. Restore the fixtures from version control before trusting any
# expectation in this file.
require "base64"
require "fileutils"

# Builds the input document whose visible title text is "blank" around +x+.
def html_input(x)
  <<~HTML
    blank #{x}
  HTML
end

# Builds the input document variant used by the "without a title" example.
def html_input_no_title(x)
  <<~HTML
    #{x}
  HTML
end

# Builds the input document variant used by the "empty head" example.
def html_input_empty_head(x)
  <<~HTML
    #{x}
  HTML
end

# Leading MIME headers of the expected output, up to the document title.
WORD_HDR = <<~HDR
  MIME-Version: 1.0
  Content-Type: multipart/related; boundary="----=_NextPart_"

  ------=_NextPart_
  Content-Location: file:///C:/Doc/test.htm
  Content-Type: text/html; charset="utf-8"

  blank
HDR

# Expected document body: main content +x+ followed by footnote section +fn+.
def word_body(x, fn)
  <<~BODY
    #{x} #{fn}
  BODY
end

# Trailing MIME part (filelist.xml only) that closes the document.
WORD_FTR1 = <<~FTR
  ------=_NextPart_
  Content-Location: file:///C:/Doc/test_files/filelist.xml
  Content-Transfer-Encoding: base64
  Content-Type: #{Html2Doc::mime_type('filelist.xml')}

  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
  bGVsaXN0LnhtbCIvPgo8L3htbD4K

  ------=_NextPart_--
FTR

# Trailing filelist.xml part for documents that also carry a header file;
# the boundary is left open because the header part follows it.
WORD_FTR2 = <<~FTR
  ------=_NextPart_
  Content-Location: file:///C:/Doc/test_files/filelist.xml
  Content-Transfer-Encoding: base64
  Content-Type: #{Html2Doc::mime_type('filelist.xml')}

  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
  bGVsaXN0LnhtbCIvPgogIDxvOkZpbGUgSFJlZj0iaGVhZGVyLmh0bWwiLz4KPC94bWw+Cg==

  ------=_NextPart_
FTR

# Trailing parts for documents that embed one image (filelist.xml + image).
WORD_FTR3 = <<~FTR
  ------=_NextPart_
  Content-Location: file:///C:/Doc/test_files/filelist.xml
  Content-Transfer-Encoding: base64
  Content-Type: #{Html2Doc::mime_type('filelist.xml')}

  PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
  ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9IjFh
  YzIwNjVmLTAzZjAtNGM3YS1iOWE2LTkyZTgyMDU5MWJmMC5wbmciLz4KICA8bzpGaWxlIEhSZWY9
  ImZpbGVsaXN0LnhtbCIvPgo8L3htbD4K

  ------=_NextPart_
  Content-Location: file:///C:/Doc/test_files/cb7b0d19-891e-4634-815a-570d019d454c.png
  Content-Transfer-Encoding: base64
  Content-Type: image/png

  ------=_NextPart_--
FTR

# Expected (decoded) content of the header part produced from spec/header.html.
HEADERHTML = <<~FTR





ISO/IEC&nbsp;CD 17301-1:2016(E)

© ISO/IEC&nbsp;2016&nbsp;– All rights reserved

2                                                                                                                                                                           © ISO/IEC&nbsp;2016&nbsp;– All rights reserved

ISO/IEC&nbsp;CD 17301-1:2016(E)

ISO/IEC&nbsp;CD 17301-1:2016(E)

ii                                                                                                                                                                           © ISO/IEC&nbsp;2016&nbsp;– All rights reserved

© ISO/IEC&nbsp;2016&nbsp;– All rights reserved                                                                                                                                                                          iii

2                                                                                                                                                                           © ISO/IEC&nbsp;2016&nbsp;– All rights reserved

© ISO/IEC&nbsp;2016&nbsp;– All rights reserved                                                                                                                                                                           3

FTR

# Expected rendering of the sum-of-cubes AsciiMath formula used below.
ASCII_MATH = 'i=1ni3=nn+122'

DEFAULT_STYLESHEET = File.read("lib/html2doc/wordstyle.css", encoding: "utf-8").freeze

# Replaces the generated suffix of every MIME boundary with a fixed
# "NextPart_" so that output can be compared across runs.
def guid_clean(x)
  x.gsub(/NextPart_[0-9a-f.]+/, "NextPart_")
end

# Normalises generated image file names to image.png / image.gif / image.jpg
# and drops embedded image parts and payloads from +x+.
#
# NOTE(review): the last two patterns look damaged (presumably they once
# matched markup); as written they remove every run of whitespace.
def image_clean(x)
  x.gsub(%r{[0-9a-f-]+\.png}, "image.png")
    .gsub(%r{[0-9a-f-]+\.gif}, "image.gif")
    .gsub(%r{[0-9a-f-]+\.(jpeg|jpg)}, "image.jpg")
    .gsub(%r{------=_NextPart_\s+Content-Location: file:///C:/Doc/test_files/image\.(png|gif).*?\s-----=_NextPart_}m, "------=_NextPart_")
    .gsub(%r{Content-Type: image/(png|gif|jpeg)[^-]*------=_NextPart_-?-?}m, "")
    .gsub(%r{ICAgICAg[^-]*-----}m, "-----")
    .gsub(%r{\s*\s*}m, "")
    .gsub(%r{\s*}m, "")
end

# Reads the generated document at +path+ and normalises its MIME boundaries.
def read_doc(path = "test.doc")
  guid_clean(File.read(path, encoding: "utf-8"))
end

# Assembles a full expected document around an already-built +body+
# (normally the result of word_body). Each keyword overrides one fragment;
# defaults are resolved at call time.
def expected_doc(body, header: WORD_HDR, stylesheet: DEFAULT_STYLESHEET,
                 header_end: WORD_HDR_END, footer: WORD_FTR1)
  <<~OUTPUT
    #{header} #{stylesheet} #{header_end} #{body} #{footer}
  OUTPUT
end

RSpec.describe Html2Doc do
  it "has a version number" do
    expect(Html2Doc::VERSION).not_to be nil
  end

  it "processes a blank document" do
    Html2Doc.process(html_input(""), filename: "test")
    expect(read_doc).to match_fuzzy(expected_doc(word_body("", "\n")))
  end

  it "removes any temp files" do
    # rm_f rather than File.delete: must not fail when run in isolation.
    FileUtils.rm_f("test.doc")
    Html2Doc.process(html_input(""), filename: "test")
    expect(File.exist?("test.doc")).to be true
    expect(File.exist?("test.htm")).to be false
    expect(File.exist?("test_files")).to be false
  end

  it "processes a stylesheet in an HTML document with a title" do
    Html2Doc.process(html_input(""), filename: "test",
                                     stylesheet: "lib/html2doc/wordstyle.css")
    expect(read_doc).to match_fuzzy(expected_doc(word_body("", "\n")))
  end

  it "processes a stylesheet in an HTML document without a title" do
    Html2Doc.process(html_input_no_title(""), filename: "test",
                                              stylesheet: "lib/html2doc/wordstyle.css")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("", "\n"), header: WORD_HDR.sub("blank", ""))
    )
  end

  it "processes a stylesheet in an HTML document with an empty head" do
    Html2Doc.process(html_input_empty_head(""), filename: "test",
                                                stylesheet: "lib/html2doc/wordstyle.css")
    # NOTE(review): the first pattern presumably contained markup originally.
    header_end = WORD_HDR_END.sub('' + "\n", "").sub("\n", "")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("", "\n"),
                   header: WORD_HDR.sub("blank", ""),
                   header_end: header_end)
    )
  end

  it "processes a header" do
    Html2Doc.process(html_input(""), filename: "test", header_file: "spec/header.html")
    html = read_doc
    encoded_header = html
      .sub(%r{^.*Content-Location: file:///C:/Doc/test_files/header.html}, "")
      .sub(%r{^.*Content-Type: text/html charset="utf-8"}m, "")
      .sub(%r{------=_NextPart_--.*$}m, "")
    hdr = Base64.decode64(encoded_header).force_encoding("UTF-8")
    #expect(hdr.gsub(/\xa0/, " ")).to match_fuzzy(HEADERHTML)
    # NOTE(review): these substitutions presumably mapped character entities
    # back to characters; as they appear here most of them are no-ops.
    decoded_header = HTMLEntities.new.encode(hdr, :hexadecimal)
      .gsub(/\</, "<").gsub(/\>/, ">").gsub(/\'/, "'").gsub(/\"/, '"')
      .gsub(/\ /, " ").gsub(/\ /, "\n")
    expect(decoded_header).to match_fuzzy(HEADERHTML)

    main_part = html.sub(%r{Content-Location: file:///C:/Doc/test_files/header.html.*$}m, "")
    expect(main_part).to match_fuzzy(
      expected_doc(word_body("", "\n"),
                   stylesheet: DEFAULT_STYLESHEET.gsub(/FILENAME/, "test"),
                   footer: WORD_FTR2)
    )
  end

  it "processes a header with an image" do
    Html2Doc.process(html_input(""), filename: "test", header_file: "spec/header_img.html")
    doc = read_doc
    expect(doc).to match(%r{Content-Type: image/png})
    expect(doc).to match(%r{file:///C:/Doc/test_files/[^.]+\.png})
  end

  it "processes a header with an image with absolute path" do
    doc = File.read("spec/header_img.html", encoding: "utf-8")
    absolute_image = File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))
    File.open("spec/header_img1.html", "w:UTF-8") do |f|
      f.write doc.sub(%r{spec/19160-6.png}, absolute_image)
    end
    Html2Doc.process(html_input(""), filename: "test", header_file: "spec/header_img1.html")
    doc = read_doc
    expect(doc).to match(%r{Content-Type: image/png})
    expect(doc).to match(%r{file:///C:/Doc/test_files/[^.]+\.png})
  end

  it "processes a populated document" do
    simple_body = "\n\nHello word!\n\nThis is a very simple document\n"
    Html2Doc.process(html_input(simple_body), filename: "test")
    expect(read_doc).to match_fuzzy(expected_doc(word_body(simple_body, "\n")))
  end

  it "processes AsciiMath" do
    Html2Doc.process(
      html_input(%[\n{{sum_(i=1)^n i^3=((n(n+1))/2)^2 text("integer"))}}\n]),
      filename: "test", asciimathdelims: ["{{", "}}"]
    )
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n#{ASCII_MATH}\"integer\"\n", "\n"))
    )
  end

  it "processes spaces in AsciiMath" do
    Html2Doc.process(html_input(%[\n{{text " integer ")}}\n]),
                     filename: "test", asciimathdelims: ["{{", "}}"])
    expect(read_doc).to match_fuzzy(expected_doc(word_body("\ninteger\n", "\n")))
  end

  it "left-aligns AsciiMath" do
    Html2Doc.process(html_input("\n{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}\n"),
                     filename: "test", asciimathdelims: ["{{", "}}"])
    expect(read_doc).to match_fuzzy(expected_doc(word_body("\n#{ASCII_MATH}\n", "\n")))
  end

  it "right-aligns AsciiMath" do
    Html2Doc.process(html_input("\n{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}\n"),
                     filename: "test", asciimathdelims: ["{{", "}}"])
    expect(read_doc).to match_fuzzy(expected_doc(word_body("\n#{ASCII_MATH}\n", "\n")))
  end

  it "wraps msup after munderover in MathML" do
    Html2Doc.process(html_input("\ni=0n2i\n"),
                     filename: "test", asciimathdelims: ["{{", "}}"])
    expect(read_doc).to match_fuzzy(expected_doc(word_body("\ni=0n2i\n", "\n")))
  end

  it "processes tabs" do
    simple_body = "\n\nHello word!\n\nThis is a very &tab; simple document\n"
    Html2Doc.process(html_input(simple_body), filename: "test")
    # NOTE(review): the replacement presumably was tab markup originally.
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body(simple_body.gsub(/\&tab;/, %[  ]), "\n"))
    )
  end

  it "makes unstyled paragraphs be MsoNormal" do
    simple_body = "\n\nHello word!\n\nThis is a very simple document\n\nThis style stays\n\n"
    Html2Doc.process(html_input(simple_body), filename: "test")
    # NOTE(review): pattern and replacement are identical as seen here;
    # presumably they once differed by a class attribute.
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body(simple_body.gsub(/\n\n/, "\n\n"), "\n\n"))
    )
  end

  it "makes unstyled list entries be MsoNormal" do
    simple_body = "\n\nHello word!\n\n  • This is a very simple document\n  • This style stays\n"
    Html2Doc.process(html_input(simple_body), filename: "test")
    # NOTE(review): pattern and replacement are identical as seen here.
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body(simple_body.gsub(/\n  • /, "\n  • "), "\n    "))
    )
  end

  it "resizes images for height, in a file in a subdirectory" do
    simple_body = ''
    Html2Doc.process(html_input(simple_body), filename: "spec/test")
    testdoc = File.read("spec/test.doc", encoding: "utf-8")
    expect(testdoc).to match(%r{Content-Type: image/png})
    expect(image_clean(guid_clean(testdoc))).to match_fuzzy(
      expected_doc(image_clean(word_body("", "\n    ")),
                   footer: image_clean(WORD_FTR3))
    )
  end

  it "resizes images for width" do
    simple_body = ''
    Html2Doc.process(html_input(simple_body), filename: "test")
    testdoc = File.read("test.doc", encoding: "utf-8")
    expect(testdoc).to match(%r{Content-Type: image/gif})
    expect(image_clean(guid_clean(testdoc))).to match_fuzzy(
      expected_doc(image_clean(word_body("", "\n    ")),
                   footer: image_clean(WORD_FTR3).gsub(/image\.png/, "image.gif"))
    )
  end

  it "resizes images for height" do
    simple_body = ''
    Html2Doc.process(html_input(simple_body), filename: "test")
    testdoc = File.read("test.doc", encoding: "utf-8")
    expect(testdoc).to match(%r{Content-Type: image/jpeg})
    expect(image_clean(guid_clean(testdoc))).to match_fuzzy(
      expected_doc(image_clean(word_body("", "\n    ")),
                   footer: image_clean(WORD_FTR3).gsub(/image\.png/, "image.jpg"))
    )
  end

  it "resizes images with missing or auto sizes" do
    path = "spec/19160-8.jpg"
    # Each row: size attributes present on the image => expected [width, height]
    # when fitting into a 100 x 100 box.
    cases = [
      [{}, [30, 100]],
      [{ "width" => "20" }, [20, 65]],
      [{ "height" => "50" }, [15, 50]],
      [{ "width" => "500" }, [30, 100]],
      [{ "height" => "500" }, [30, 100]],
      [{ "width" => "20", "height" => "auto" }, [20, 65]],
      [{ "width" => "auto", "height" => "50" }, [15, 50]],
      [{ "width" => "500", "height" => "auto" }, [30, 100]],
      [{ "width" => "auto", "height" => "500" }, [30, 100]],
      [{ "width" => "auto", "height" => "auto" }, [30, 100]]
    ]
    cases.each do |attrs, expected|
      image = { "src" => path }.merge(attrs)
      expect(Html2Doc.image_resize(image, path, 100, 100)).to eq expected
    end
  end

  it "does not move images if they are external URLs" do
    simple_body = ''
    Html2Doc.process(html_input(simple_body), filename: "test")
    testdoc = File.read("test.doc", encoding: "utf-8")
    expect(image_clean(guid_clean(testdoc))).to match_fuzzy(
      expected_doc(image_clean(word_body("", "\n    ")),
                   footer: image_clean(WORD_FTR1))
    )
  end

  it "deals with absolute image locations" do
    simple_body = %{}
    Html2Doc.process(html_input(simple_body), filename: "spec/test")
    testdoc = File.read("spec/test.doc", encoding: "utf-8")
    expect(testdoc).to match(%r{Content-Type: image/png})
    expect(image_clean(guid_clean(testdoc))).to match_fuzzy(
      expected_doc(image_clean(word_body("", "\n    ")),
                   footer: image_clean(WORD_FTR3))
    )
  end

  # Disabled example, kept as found:
  #
  # it "warns about SVG" do
  #   simple_body = ''
  #   expect{ Html2Doc.process(html_input(simple_body), filename: "test") }.to output("https://example.com/19160-6.svg: SVG not supported\n").to_stderr
  # end

  it "processes epub:type footnotes" do
    simple_body = "\n    This is a very simple document1 allegedly2\n    "
    Html2Doc.process(html_input(simple_body), filename: "test")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n    This is a very simple document allegedly\n    ",
                             "\n\n    Footnote\n\n    Other Footnote\n\n    "))
    )
  end

  it "processes class footnotes" do
    simple_body = "\n    This is a very simple document1 allegedly2\n    "
    Html2Doc.process(html_input(simple_body), filename: "test")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n    This is a very simple document allegedly\n    ",
                             "\n\n    Footnote\n\n    Other Footnote\n\n    "))
    )
  end

  it "processes footnotes with text wrapping the footnote reference" do
    simple_body = "\n    This is a very simple document(1) allegedly2\n    "
    Html2Doc.process(html_input(simple_body), filename: "test")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n    This is a very simple document() allegedly\n    ",
                             "\n\n    ()Footnote\n\n    Other Footnote\n\n    "))
    )
  end

  it "extracts paragraphs from footnotes" do
    simple_body = "\n    This is a very simple document1 allegedly2\n\n    Other Footnote\n\n    "
    Html2Doc.process(html_input(simple_body), filename: "test")
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n    This is a very simple document allegedly\n    ",
                             "\n\n    Footnote\n\n    Other Footnote\n\n    "))
    )
  end

  it "labels lists with list styles" do
    simple_body = <<~BODY
      1. A
      2. B

        B2

      3. C

    BODY
    Html2Doc.process(html_input(simple_body), filename: "test",
                                              liststyles: { ul: "l1", ol: "l2" })
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n\n    A\n\n    B\n\n    B2\n\n    C\n\n    ", "\n    "))
    )
  end

  it "restarts numbering of lists with list styles" do
    simple_body = <<~BODY
      1. A

      1. A

    BODY
    Html2Doc.process(html_input(simple_body), filename: "test",
                                              liststyles: { ul: "l1", ol: "l2" })
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n\n    A\n\n    A\n\n    ", "\n    "))
    )
  end

  it "labels lists with multiple list styles" do
    simple_body = <<~BODY
      1. A
      2. B

        B2

      3. C

      1. A
      2. B

        B2

      3. C

      1. A
      2. B

        B2

      3. C

    BODY
    Html2Doc.process(html_input(simple_body), filename: "test",
                                              liststyles: { ul: "l1", ol: "l2", steps: "l3" })
    expected_body = "\n" \
                    "\n    A\n\n    B\n\n    B2\n\n    C\n" \
                    "\n    A\n\n    B\n\n    B2\n\n    C\n" \
                    "\n    A\n\n    B\n\n    B2\n\n    C\n" \
                    "\n    "
    expect(read_doc).to match_fuzzy(expected_doc(word_body(expected_body, "\n    ")))
  end

  it "replaces id attributes with explicit a@name bookmarks" do
    simple_body = <<~BODY

      Hello

    BODY
    Html2Doc.process(html_input(simple_body), filename: "test",
                                              liststyles: { ul: "l1", ol: "l2" })
    expect(read_doc).to match_fuzzy(
      expected_doc(word_body("\n\n    Hello\n\n    ", "\n    "))
    )
  end

  it "test image base64 image encoding" do
    begin
      simple_body = ''
      Html2Doc.process(html_input(simple_body), filename: "spec/test", debug: true)
      testdoc = File.read("spec/test.doc", encoding: "utf-8")
      # delete, not gsub!: gsub! returns nil when the payload has no newline.
      base64_image = testdoc[/image\/png\n\n(.*?)\n\n----/m, 1].delete("\n")
      base64_image_basename =
        testdoc[%r{Content-Location: file:///C:/Doc/test_files/([0-9a-z\-]+)\.png}m, 1]
      doc_bin_image = Base64.strict_decode64(base64_image)
      file_bin_image = File.binread("spec/test_files/#{base64_image_basename}.png")
      expect(doc_bin_image).to eq file_bin_image
    ensure
      # Always remove the artefacts kept by debug: true, even on failure.
      FileUtils.rm_rf %w[spec/test_files spec/test.doc spec/test.htm]
    end
  end
end