class Html2Doc
# Frozen heredoc string constant; it is not referenced by any method
# visible in this part of the class.
# NOTE(review): the heredoc body is empty as shown. It presumably held
# markup that has been lost from this copy of the file -- confirm against
# version control before relying on its value.
NOKOHEAD = <<~HERE.freeze
HERE
# Turns a markup string into a parsed document via Nokogiri::XML.parse.
#
# Steps visible in the code:
# 1. Every `<?xml ...>` declaration is removed from +xml+ with gsub!, so
#    the caller's string object is modified in place.
# 2. An `unless` guard conditionally prepends a literal to +xml+.
# 3. Two chained gsub calls rewrite the text before it is parsed.
#
# NOTE(review): the guard condition in step 2 and the patterns in step 3
# look truncated (text that would have contained angle brackets is
# missing), so the method does not parse as shown. Restore the original
# literals from version control before changing any logic here.
#
# @param xml [String] markup to parse; mutated by the initial gsub!
# @return the value returned by Nokogiri::XML.parse
def to_xhtml(xml)
xml.gsub!(/<\?xml[^<>]*>/, "")
unless /' + xml
end
xml = xml.gsub(/")
.gsub(//, "")
Nokogiri::XML.parse(xml)
end
# Frozen heredoc string constant. from_xhtml removes the first occurrence
# of this text from the serialized document.
# NOTE(review): the heredoc body is empty as shown, which would make that
# removal a no-op. The original content (presumably a DOCTYPE declaration)
# appears to have been lost from this copy -- confirm against version
# control.
DOCTYPE = <<~DOCTYPE.freeze
DOCTYPE
# Serializes +xml+ with to_xml and post-processes the resulting string.
#
# Substitutions visible in the code, in order:
# - the first ` xmlns="http://www.w3.org/1999/xhtml"` attribute is removed;
# - the first occurrence of the DOCTYPE constant is removed;
# - every ` />` is tightened to `/>`;
# - two further gsub calls follow.
#
# NOTE(review): the third line of the chain looks truncated, and the final
# gsub replaces "\n-->\n" with identical text as shown. Both suggest that
# entity or markup text was lost from this copy; restore the original
# literals from version control before editing.
#
# @param xml an object responding to to_xml (presumably a Nokogiri
#   document or node -- confirm against callers)
# @return [String] the post-processed serialization
def from_xhtml(xml)
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
.gsub(//, "/, "")
.gsub("\n-->\n", "\n-->\n")
end
# Applies a series of textual substitutions to +doc+ and returns the
# rewritten text as a new string.
#
# What the code visibly does:
# - runs a sequence of gsub! calls, so +doc+ itself is modified in place;
# - one of them drops the `m:` prefix from opening and closing `span` and
#   `em` tags (`<m:span` becomes `<span`, `</m:em` becomes `</em`);
# - one of them replaces `&tab;` with the quoted literal on the line
#   that follows it;
# - finally the text is split on a capturing regex, the pieces are taken
#   in groups of up to four, the third piece of any group with more than
#   two elements is rewritten with gsub, and everything is joined again.
#
# NOTE(review): most of the patterns below are empty or truncated as shown
# (the tag text they matched appears to have been lost), including the
# split pattern and the gsub inside the block, so this method does not
# parse in its current form. Restore the original literals from version
# control before editing.
#
# @param doc [String] document text; mutated by the gsub! calls
# @return [String] the joined result of the final split/map
def msword_fix(doc)
# brain damage in MSWord parser
doc.gsub!(%r{},
"")
doc.gsub!(%r{},
'')
doc.gsub!(%r{},
'')
doc.gsub!(%r{(")
doc.gsub!(%r{}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{>}, "/>")
doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
doc.gsub!(%r{&tab;|&tab;},
' ')
doc.split(%r{(|)}).each_slice(4).map do |a|
a.size > 2 and a[2] = a[2].gsub(/>\s+, "><")
a
end.join
end
# Frozen heredoc string constant; it is not referenced by any method
# visible in this part of the class.
# NOTE(review): as shown the body is just the two lines "Print" and "100".
# Given the XML heredoc tag, these were presumably element values (e.g. a
# view mode and a zoom percentage) whose surrounding tags have been lost
# from this copy -- confirm against version control.
PRINT_VIEW = <<~XML.freeze
Print
100
XML
# Declares the Microsoft Office namespace prefixes on +root+.
#
# For each of the prefixes o, w, v and m, calls
# root.add_namespace_definition with the prefix as a String and its URI.
#
# @param root an object responding to add_namespace_definition(prefix, uri)
# @return [Hash{Symbol => String}] the prefix-to-URI table that was applied
def namespace(root)
  office_namespaces = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  office_namespaces.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end
# Adds the HTML 4.0 recommendation URI to +root+ as a namespace with a nil
# prefix (per the Nokogiri API, a nil prefix denotes the default namespace).
#
# @param root an object responding to add_namespace(prefix, uri)
# @return whatever root.add_namespace returns
def rootnamespace(root)
  html40_uri = "http://www.w3.org/TR/REC-html40"
  root.add_namespace(nil, html40_uri)
end
end