# encoding: utf-8 require 'nokogiri' require 'cgi' # # Generates a table of contents for any HTML markup, and adds anchors to headings. # module Amber::Render ## ## TABLE OF CONTENTS ## class TableOfContents # # options: # :content_selector (css selector for headings, nokogiri backend only) # :href_base -- use this href for the toc links # :numeric_prefix -- prefix toc entries and headings with numeric counter (e.g. 1.1.0, 1.2.0, ...) # def initialize(html, options = {}) @html = html @toc = TocItem.new @levels = {"h1" => 0, "h2" => 0, "h3" => 0, "h4" => 0} @heading_anchors = {} @options = options @options[:tag] ||= 'ol' @parsed = nil end def to_html parse_doc unless @parsed # override this! end def to_toc parse_doc unless @parsed # override this! end private def parse_doc each_heading(@html) do |heading, heading_text| heading_anchor = anchor_text(heading_text) heading_text = strip_anchors(heading_text) if @options[:numeric_prefix] increment_level(heading) heading_text = level_text + " " + heading_text end @toc.add_heading(heading, heading_text, heading_anchor) '%s' % [heading_anchor, heading_text] end @parsed = true end # # returns anchor text from heading text. # e.g. First Heading! => first-heading # # if there are duplicates, they get numbered: # heading => heading # heading => heading-2 # heading => heading-3 # def anchor_text(heading_text) text = nameize(strip_html_tags(heading_text)) text_with_suffix = text i = 2 while @heading_anchors[text_with_suffix] text_with_suffix = "#{text}-#{i}" i+=1 end @heading_anchors[text_with_suffix] = true text_with_suffix end # # convert any string to one suitable for a url. # resist the urge to translit non-ascii slugs to ascii. # it is always much better to keep strings as utf8. # def nameize(str) str = str.dup str.gsub!(/&(\w{2,6}?|#[0-9A-Fa-f]{2,6});/,'') # remove html entitities str.gsub!(/[^- [[:word:]]]/u, '') # remove non-word characters (using unicode definition of a word char) str.strip! str.downcase! # upper case characters in urls are confusing str.gsub!(/\ +/u, '-') # spaces to dashes, preferred separator char everywhere CGI.escape(str) end # removes all html markup def strip_html_tags(html) Nokogiri::HTML::DocumentFragment.parse(html, 'UTF-8').children.collect{|child| child.inner_text}.join end # remove from html, but leaves all other tags in place. def strip_anchors(html) Nokogiri::HTML::DocumentFragment.parse(html, 'UTF-8').children.collect{|child| if child.name == "text" child.inner_text elsif child.name != 'a' || !child.attributes.detect{|atr| atr[0] == 'name'} child.to_s end }.join end # # prefix headings with text like 1.2.1, if :numeric_prefix => true # def level_text [@levels["h1"], @levels["h2"], @levels["h3"], @levels["h4"]].join(".").gsub(/\.0/, "") end # # keeps a counter of the latest heading at each level # def increment_level(heading) @levels[heading] += 1 @levels["h2"] = 0 if heading == "h1" @levels["h3"] = 0 if heading == "h1" || heading == "h2" @levels["h4"] = 0 if heading == "h1" || heading == "h2" || heading == "h3" end def each_heading(html, &block) raise 'override me' end end ## ## NOKOGIRI TOC ## class NokogiriTableOfContents < TableOfContents def to_html super @nokogiri_doc.to_html.gsub(/()\n/, '\1').gsub(/\n(<\/h\d.*?>)/, '\1') end def to_toc super ul = Nokogiri::XML::Node.new(@options[:tag], Nokogiri::HTML.fragment("")) @toc.populate_node(ul, @options) ul.to_pretty_html end private def each_heading(html, &block) @nokogiri_doc = Nokogiri::HTML.fragment(html, "UTF-8") if @options[:content_selector] selector = @levels.keys.map {|h| "#{@options[:content_selector]} #{h}" }.join(",") else selector = @levels.keys.join(",") end @nokogiri_doc.css(selector).each do |node| node.inner_html = yield(node.name, node.inner_html) end end end ## ## REGEX TOC ## class RegexTableOfContents < TableOfContents def to_html super @new_html end def to_toc super @toc.to_html(@options) end private HEADING_EX = %r{ <\s*((h\d).*?)\s*> # match starting

(.+)? # match innner text <\s\/\2\s> # match closing

}x def each_heading(html, &block) @new_html = html.gsub(HEADING_EX) do |match| "<%s>%s" % [$1, yield($2, $3), $2] end end end ## ## TOC ITEM ## ## A tree of TocItems composes the table of contents outline. ## class TocItem attr_reader :children, :level, :text, :anchor def initialize(heading='h0', text=nil, anchor=nil) @level = heading[1].to_i if heading.is_a?(String) @text = text @anchor = anchor @children = [] end def add_heading(heading, heading_text, heading_anchor) self.parent_for(heading).children << TocItem.new(heading, heading_text, heading_anchor) end # # generates nokogiri html node tree from this toc # def populate_node(node, options) @children.each do |item| li = node.document.create_element("li") li.add_child(li.document.create_element("a", item.text, :href => "#{options[:href_base]}##{item.anchor}")) if item.children.any? ul = li.document.create_element(options[:tag]) item.populate_node(ul, options) li.add_child(ul) end node.add_child(li) end end # # generates html string from this toc # def to_html(options={}) html = [] tag = options[:tag] indent = options[:indent] || 0 str = options[:indent_str] || " " html << '%s<%s>' % [(str*indent), tag] @children.each do |item| html << '%s

' % (str*(indent+1)) html << '%s%s' % [str*(indent+2), options[:href_base], item.anchor, item.text] if item.children.any? html << item.to_html({ :indent => indent+2, :indent_str => str, :tag => tag, :href_base => options[:href_base] }) end html << '%s

' % (str*(indent+1)) end html << '%s' % [(str*indent), tag] html.join("\n") end # # Returns the appropriate TocItem for appending a new item # at a particular heading level. # def parent_for(heading) heading = heading[1].to_i if heading.is_a?(String) if children.any? && children.last.level < heading children.last.parent_for(heading) else self end end end end class Nokogiri::XML::Node def to_pretty_html(indent=0) indent_str = " " * indent children_html = [] text_html = nil if children.size == 1 && children.first.name == "text" text_html = children.first.content else children.each do |child| if child.name == "text" children_html << "#{" " * (indent+1)}#{child.content}" if !child.content.empty? else children_html << child.to_pretty_html(indent+1) end end end attrs = [] attributes.each do |attribute| attrs << %(#{attribute[0]}="#{attribute[1]}") end if attrs.any? attr_html = " " + attrs.join(' ') else attr_html = "" end html = [] if text_html html << "#{indent_str}<#{name}#{attr_html}>#{text_html}" elsif children_html.any? html << "#{indent_str}<#{name}#{attr_html}>" html += children_html html << "#{indent_str}" else html << "#{indent_str}<#{name}#{attr_html}>" end html.join("\n") end end =begin AN ATTEMPT TO GET NOKOGIRI TO OUTPUT REASONABLE HTML5. NO LUCK. # # convert a Nokogiri::HTML::Document into well formatted html. # unfortunately, Nokogiri formatting only works on complete documents, so we strip away the tags. :( # def format_doc(doc) INDENT_XSLT.apply_to(doc).to_s.sub("\n", '').sub('', '') end # from https://github.com/jarijokinen/html5-beautifier/blob/master/lib/html5-beautifier/xslt/html5-beautifier.xslt # MIT License INDENT_XSLT_STRING = < <!DOCTYPE html>

EOF INDENT_XSLT = Nokogiri::XSLT(INDENT_XSLT_STRING.gsub('__INDENT_STRING__', ' ')) =end

(.+)? # match innner text <\s*\/\2\s*> # match closing

(.+)? # match innner text <\s\/\2\s> # match closing