require 'nokogiri' require 'ronn/utils' module Ronn # Filter for converting HTML to ROFF class RoffFilter include Ronn::Utils # Convert Ronn HTML to roff. # The html input is an HTML fragment, not a complete document def initialize(html_fragment, name, section, tagline, manual = nil, version = nil, date = nil) @buf = [] title_heading name, section, tagline, manual, version, date doc = Nokogiri::HTML.fragment(html_fragment) remove_extraneous_elements! doc normalize_whitespace! doc block_filter doc write "\n" end def to_s @buf.join.gsub(/[ \t]+$/, '') end protected def previous(node) return unless node.respond_to?(:previous) prev = node.previous prev = prev.previous until prev.nil? || prev.elem? prev end def title_heading(name, section, _tagline, manual, version, date) comment "generated with Ronn-NG/v#{Ronn.version}" comment "http://github.com/apjanke/ronn-ng/tree/#{Ronn.revision}" return if name.nil? if manual macro 'TH', %("#{escape(name.upcase)}" "#{section}" "#{date.strftime('%B %Y')}" "#{version}" "#{manual}") else macro 'TH', %("#{escape(name.upcase)}" "#{section}" "#{date.strftime('%B %Y')}" "#{version}") end end def remove_extraneous_elements!(doc) doc.traverse do |node| node.parent.children.delete(node) if node.comment? end end def normalize_whitespace!(node) if node.is_a?(Array) || node.is_a?(Nokogiri::XML::NodeSet) node.to_a.dup.each { |ch| normalize_whitespace! ch } elsif node.text? preceding = node.previous following = node.next content = node.content.gsub(/[\n ]+/m, ' ') if preceding.nil? || block_element?(preceding.name) || preceding.name == 'br' content.lstrip! end if following.nil? || block_element?(following.name) || following.name == 'br' content.rstrip! end if content.empty? node.remove else node.content = content end elsif node.elem? && node.name == 'pre' # stop traversing elsif node.elem? && node.children normalize_whitespace! node.children elsif node.elem? # element has no children elsif node.document? || node.fragment? normalize_whitespace! node.children elsif node.is_a?(Nokogiri::XML::DTD) || node.is_a?(Nokogiri::XML::Comment) # ignore nop else warn 'unexpected node during whitespace normalization: %p', node end end def block_filter(node) return if node.nil? if node.is_a?(Array) || node.is_a?(Nokogiri::XML::NodeSet) node.each { |ch| block_filter(ch) } elsif node.document? || node.fragment? block_filter(node.children) elsif node.text? # This hack is necessary to support mixed-child-type dd's inline_filter(node) elsif node.elem? case node.name when 'html', 'body' block_filter(node.children) when 'div' block_filter(node.children) when 'h1' # discard nop when 'h2' macro 'SH', quote(escape(node.inner_html)) when 'h3' macro 'SS', quote(escape(node.inner_html)) when 'h4', 'h5', 'h6' # Ronn discourages use of this many heading levels, but if they are used, # we should make them legible instead of ignoring them. macro 'SS', quote(escape(node.inner_html)) when 'p' prev = previous(node) if prev && %w[dd li blockquote].include?(node.parent.name) macro 'IP' elsif prev && !%w[h1 h2 h3].include?(prev.name) macro 'P' elsif node.previous&.text? macro 'IP' end inline_filter(node.children) when 'blockquote' prev = previous(node) indent = prev.nil? || !%w[h1 h2 h3].include?(prev.name) macro 'IP', %w["" 4] if indent block_filter(node.children) macro 'IP', %w["" 0] if indent when 'pre' prev = previous(node) indent = prev.nil? || !%w[h1 h2 h3].include?(prev.name) macro 'IP', %w["" 4] if indent macro 'nf' # HACK: strip an initial \n to avoid extra spacing if node.children && node.children[0].text? text = node.children[0].to_s node.children[0].replace(text[1..]) if text.start_with? "\n" end inline_filter(node.children) macro 'fi' macro 'IP', %w["" 0] if indent when 'dl' macro 'TP' block_filter(node.children) when 'dt' prev = previous(node) macro 'TP' unless prev.nil? inline_filter(node.children) write "\n" when 'dd' if node.at('p') block_filter(node.children) else inline_filter(node.children) end write "\n" when 'ol', 'ul' block_filter(node.children) macro 'IP', %w["" 0] when 'li' case node.parent.name when 'ol' macro 'IP', %W["#{node.parent.children.index(node) + 1}." 4] when 'ul' macro 'IP', ['"\(bu"', '4'] else raise "List element found as a child of non-list parent element: #{node.inspect}" end if node.at('p,ol,ul,dl,div') block_filter(node.children) else inline_filter(node.children) end write "\n" when 'span', 'code', 'b', 'strong', 'kbd', 'samp', 'var', 'em', 'i', 'u', 'br', 'a' inline_filter(node) when 'table' macro 'TS' write "allbox;\n" block_filter(node.children) macro 'TE' when 'thead' # Convert to format section and first row tr = node.children[0] header_contents = [] cell_formats = [] tr.children.each do |th| style = th['style'] cell_format = case style when 'text-align:left;' 'l' when 'text-align:right;' 'r' when 'text-align:center;' 'c' else 'l' end header_contents << th.inner_html cell_formats << cell_format end write cell_formats.join(' ') + ".\n" write header_contents.join("\t") + "\n" when 'th' raise 'internal error: unexpected