require 'set'

# Set once Nokogiri has failed to load, so that the fallback warning is
# printed only once and later fragments go straight to REXML.
$warned_nokogiri = false

module MaRuKu
  # Names of elements treated as span-level (inline) when deciding whether
  # to process Markdown inside raw HTML. Covers HTML, SVG and MathML names.
  HTML_INLINE_ELEMS = Set.new %w[a abbr acronym audio b bdi bdo big br button canvas caption cite
    code col colgroup command datalist del details dfn dir em fieldset font form i img input ins
    kbd label legend mark meter optgroup option progress q rp rt ruby s samp select small source
    span strike strong sub summary sup tbody td tfoot th thead time tr track tt u var video wbr
    animate animateColor animateMotion animateTransform circle clipPath defs desc ellipse
    feGaussianBlur filter font-face font-face-name font-face-src foreignObject g glyph hkern
    linearGradient line marker mask metadata missing-glyph mpath path pattern polygon polyline
    radialGradient rect set stop svg switch text textPath title tspan use
    annotation annotation-xml maction math menclose merror mfrac mfenced mi mmultiscripts mn mo
    mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub msubsup msup mtable mtd
    mtext mtr munder munderover none semantics]

  # Parse block-level markdown elements in these HTML tags
  BLOCK_TAGS = Set.new %w[div section]

  # This gets mixed into HTML MDElement nodes to hold the parsed document fragment
  module HTMLElement
    attr_accessor :parsed_html
  end

  # This is just a factory, not an actual class
  module HTMLFragment
    # Doctype placed in front of every wrapped fragment (just for the
    # entity references).
    # NOTE(review): confirm this is the DTD the project intends to declare.
    DOCTYPE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" ' \
              '"http://www.w3.org/2002/08/xhtml/xhtml-math-svg/xhtml-math-svg.dtd">'

    # Wrap raw HTML in a dummy document so that it always has exactly one
    # root element, even when the fragment is bare text or has several
    # top-level nodes.
    #
    # @param raw_html [String] The HTML fragment as written by the user.
    # @return [String] A complete XML document containing the fragment.
    def self.wrap(raw_html)
      DOCTYPE + "\n<html>" + raw_html.strip + '</html>'
    end

    # HTMLFragment.new produces a concrete HTMLFragment implementation
    # that is either a NokogiriHTMLFragment or a REXMLHTMLFragment.
    #
    # Nokogiri is used only when MaRuKu::Globals[:html_parser] is 'nokogiri'
    # and a previous attempt to load it has not already failed.
    #
    # @param raw_html [String] The HTML fragment to parse.
    # @return [NokogiriHTMLFragment, REXMLHTMLFragment]
    def self.new(raw_html)
      if !$warned_nokogiri && MaRuKu::Globals[:html_parser] == 'nokogiri'
        begin
          require 'nokogiri'
          return NokogiriHTMLFragment.new(raw_html)
        rescue LoadError
          warn "Nokogiri could not be loaded. Falling back to REXML."
          $warned_nokogiri = true
        end
      end

      require 'rexml/document'
      REXMLHTMLFragment.new(raw_html)
    end
  end

  # Nokogiri backend for HTML handling
  class NokogiriHTMLFragment
    # @param raw_html [String] The HTML fragment to parse.
    def initialize(raw_html)
      # Wrap our HTML in a dummy document with a doctype (just
      # for the entity references)
      wrapped = HTMLFragment.wrap(raw_html)

      # nonet: never touch the network while parsing (e.g. to fetch the DTD).
      d = Nokogiri::XML::Document.parse(wrapped) { |c| c.nonet }
      @fragment = d.root
    end

    # @return [String, nil] The name of the first child node in the
    #   fragment, or nil if the fragment is empty.
    def first_node_name
      first_child = @fragment.children.first
      first_child ? first_child.name : nil
    end

    # Add a class to the child elements of this fragment.
    #
    # @param class_name [String] The class to append to each element.
    def add_class(class_name)
      # Only elements can carry attributes; text and comment nodes are skipped.
      @fragment.element_children.each do |c|
        c['class'] = ((c['class'] || '').split(' ') + [class_name]).join(' ')
      end
    end

    # Process markdown within the contents of some elements and
    # replace their contents with the processed version.
    #
    # @param doc [MaRuKu::MDDocument] A document to process.
    def process_markdown_inside_elements(doc)
      # find span elements or elements with 'markdown' attribute
      elts = @fragment.css("[markdown]")

      d = @fragment.children.first
      if d && HTML_INLINE_ELEMS.include?(d.name)
        elts << d unless d.attribute('markdown')
        elts += span_descendents(d)
      end

      elts.each do |e|
        how = e['markdown']
        e.remove_attribute('markdown')

        next if "0" == how # user requests no markdown parsing inside

        # should we parse block-level or span-level?
        parse_blocks = (how == 'block') || BLOCK_TAGS.include?(e.name)

        # Select all text children of e
        e.xpath("./text()").each do |original_text|
          s = MaRuKu::Out::HTML.escapeHTML(original_text.text)
          next if s.strip.empty?

          parsed = parse_blocks ? doc.parse_text_as_markdown(s) : doc.parse_span(s)

          # restore leading and trailing spaces
          # ('.' does not match newlines, so multi-line text yields no match
          # and is left without padding)
          padding = /\A(\s*).*?(\s*)\z/.match(s)
          parsed = [padding[1]] + parsed + [padding[2]] if padding

          el = doc.md_el(:dummy, parsed)

          # Nokogiri collapses consecutive Text nodes, so replace it by a dummy element
          guard = Nokogiri::XML::Element.new('guard', @fragment)
          original_text.replace(guard)
          el.children_to_html.each do |x|
            guard.before(x.to_s)
          end
          guard.remove
        end
      end
    end

    # Convert this fragment to an HTML or XHTML string.
    # @return [String]
    def to_html
      # XHTML output without the pretty-printing (FORMAT) flag.
      output_options = Nokogiri::XML::Node::SaveOptions::DEFAULT_XHTML ^
        Nokogiri::XML::Node::SaveOptions::FORMAT
      @fragment.children.inject("") do |out, child|
        out << child.serialize(:save_with => output_options, :encoding => 'UTF-8')
      end
    end

    private

    # Get all span-level descendents of the given element, recursively,
    # as a flat NodeSet. Children of non-span elements are not visited.
    #
    # @param e [Nokogiri::XML::Node] An element.
    # @return [Nokogiri::XML::NodeSet]
    def span_descendents(e)
      descendents = Nokogiri::XML::NodeSet.new(Nokogiri::XML::Document.new)
      e.element_children.each do |c|
        next unless HTML_INLINE_ELEMS.include?(c.name)
        descendents << c
        descendents += span_descendents(c)
      end
      descendents
    end
  end

  # An HTMLFragment implementation using REXML
  class REXMLHTMLFragment
    # @param raw_html [String] The HTML fragment to parse.
    def initialize(raw_html)
      # Wrap our HTML in a dummy document with a doctype (just
      # for the entity references)
      wrapped = HTMLFragment.wrap(raw_html)

      @fragment = REXML::Document.new(wrapped).root
    end

    # The name of the first element in the fragment
    #
    # @return [String, nil] nil when the fragment is empty or its first
    #   child has no name (e.g. it is a text node).
    def first_node_name
      first_child = @fragment.children.first
      (first_child && first_child.respond_to?(:name)) ? first_child.name : nil
    end

    # Add a class to the children of this fragment
    #
    # @param class_name [String] The class to append to each element.
    def add_class(class_name)
      @fragment.each_element do |c|
        c.attributes['class'] = ((c.attributes['class'] || '').split(' ') + [class_name]).join(' ')
      end
    end

    # Process markdown within the contents of some elements and
    # replace their contents with the processed version.
    #
    # @param doc [MaRuKu::MDDocument] A document to process.
    def process_markdown_inside_elements(doc)
      # find span elements or elements with 'markdown' attribute
      elts = []
      @fragment.each_element('//*[@markdown]') do |e|
        elts << e
      end

      d = @fragment.children.first
      if d && HTML_INLINE_ELEMS.include?(first_node_name)
        elts << d unless d.attributes['markdown']
        elts.concat(span_descendents(d))
      end

      elts.each do |e|
        # should we parse block-level or span-level?
        how = e.attributes['markdown']
        e.attributes.delete('markdown')

        next if "0" == how # user requests no markdown parsing inside

        parse_blocks = (how == 'block') || BLOCK_TAGS.include?(e.name)

        # Select all text children of e
        e.texts.each do |original_text|
          s = MaRuKu::Out::HTML.escapeHTML(original_text.value)
          next if s.strip.empty?

          # TODO extract common functionality
          parsed = parse_blocks ? doc.parse_text_as_markdown(s) : doc.parse_span(s)

          # restore leading and trailing spaces
          # ('.' does not match newlines, so multi-line text yields no match
          # and is left without padding)
          padding = /\A(\s*).*?(\s*)\z/.match(s)
          parsed = [padding[1]] + parsed + [padding[2]] if padding

          el = doc.md_el(:dummy, parsed)

          # The rendered pieces may be bare text or several sibling nodes, so
          # give them a single throwaway root that REXML can parse.
          new_html = "<dummy>"
          el.children_to_html.each do |x|
            new_html << x.to_s
          end
          new_html << "</dummy>"

          newdoc = REXML::Document.new(new_html).root

          # Splice the rendered nodes in where the original text node was.
          p = original_text.parent
          newdoc.children.each do |c|
            p.insert_before(original_text, c)
          end

          p.delete(original_text)
        end
      end
    end

    # Convert this fragment to an HTML or XHTML string.
    # @return [String]
    def to_html
      formatter = REXML::Formatters::Default.new(true)
      @fragment.children.inject("") do |out, child|
        out << formatter.write(child, '')
      end
    end

    private

    # Get all span-level descendents of the given element, recursively,
    # as an Array. Children of non-span elements are not visited.
    #
    # @param e [REXML::Element] An element.
    # @return [Array<REXML::Element>]
    def span_descendents(e)
      descendents = []
      e.each_element do |c|
        next unless HTML_INLINE_ELEMS.include?(c.name)
        descendents << c
        descendents.concat(span_descendents(c))
      end
      # Return the accumulated list explicitly; each_element's own return
      # value is not the list built above.
      descendents
    end
  end
end