# Html2Doc: helpers for moving a document between an HTML string and a
# Nokogiri XML tree, plus textual fix-ups applied to the serialized output.
#
# Members visible in this class:
#   NOKOHEAD, DOCTYPE, PRINT_VIEW - frozen squiggly-heredoc string constants.
#   to_xhtml(xml)      - removes any <?xml ...?> declaration from the string in
#                        place (gsub!), applies further substitutions, and
#                        returns Nokogiri::XML.parse(xml).
#   from_xhtml(xml)    - calls xml.to_xml, drops the first
#                        xmlns="http://www.w3.org/1999/xhtml" attribute and the
#                        first occurrence of DOCTYPE, rewrites " />" to "/>",
#                        applies further substitutions, and returns the string.
#   msword_fix(doc)    - applies a series of in-place gsub! substitutions to the
#                        string doc, then splits it on a delimiter regex,
#                        rewrites the third element of every 4-element slice,
#                        and returns the re-joined string.
#   namespace(root)    - calls root.add_namespace_definition for the prefixes
#                        o, w, v and m with the URIs listed in the hash.
#   rootnamespace(root) - calls root.add_namespace(nil,
#                        "http://www.w3.org/TR/REC-html40"), i.e. with no prefix.
#
# NOTE(review): this copy of the file looks damaged. The heredoc bodies are
# empty or reduced to bare words, and several regex / replacement literals are
# empty or unterminated (for example `unless /' + xml end`, `.gsub(//, "")`,
# `%r{(|)}`). Presumably markup inside those literals was stripped when the
# text was extracted - TODO confirm against the upstream file and restore the
# literals before relying on this code.
# NOTE(review): because the definitions are collapsed onto one physical line,
# the inline "# brain damage in MSWord parser" comment also swallows every
# statement that follows it on that line; the original line breaks need to be
# restored as well.
class Html2Doc NOKOHEAD = <<~HERE.freeze HERE def to_xhtml(xml) xml.gsub!(/<\?xml[^<>]*>/, "") unless /' + xml end xml = xml.gsub(/") .gsub(//, "") Nokogiri::XML.parse(xml) end DOCTYPE = <<~DOCTYPE.freeze DOCTYPE def from_xhtml(xml) xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "") .sub(DOCTYPE, "").gsub(%{ />}, "/>") .gsub(//, "/, "") .gsub("\n-->\n", "\n-->\n") end def msword_fix(doc) # brain damage in MSWord parser doc.gsub!(%r{}, "") doc.gsub!(%r{}, '') doc.gsub!(%r{
}, '
') doc.gsub!(%r{(") doc.gsub!(%r{}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{>}, "/>") doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2") doc.gsub!(%r{&tab;|&tab;}, '  ') doc.split(%r{(|)}).each_slice(4).map do |a| a.size > 2 and a[2] = a[2].gsub(/>\s+<") a end.join end PRINT_VIEW = <<~XML.freeze Print 100 XML def namespace(root) { o: "urn:schemas-microsoft-com:office:office", w: "urn:schemas-microsoft-com:office:word", v: "urn:schemas-microsoft-com:vml", m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) } end def rootnamespace(root) root.add_namespace(nil, "http://www.w3.org/TR/REC-html40") end end