# Generates ids for the header nodes (h2..h6) of the given doc, deriving each
# id from the heading text and ensuring that no two headers share an id.
#
# Conflicts are resolved in two passes:
#   1. the ids of the "parent" headers are prepended, one level at a time;
#   2. whatever still collides afterwards receives a numeric suffix.
class HeaderIdGenerator
  # CSS selector matching every header level that receives an id.
  HEADER_SELECTOR = "h2,h3,h4,h5,h6".freeze

  # Assigns an 'id' attribute to every header node of +doc+.
  #
  # @param doc [#css] document to process (presumably a Nokogiri document --
  #   only +css+ is called on it, and +name+, +content+ and +[]=+ on its nodes)
  # @return [HeaderIdGenerator] the generator instance that did the work
  def self.apply!(doc)
    new(doc)
  end

  # Computes the ids and writes them to the header nodes right away.
  #
  # @param doc [#css] see {.apply!}
  def initialize(doc)
    @doc = doc
    @header_nodes = @doc.css(HEADER_SELECTOR).to_a

    # { node -> id } hash, in document order
    @nodes_ids = @header_nodes.each_with_object({}) { |node, ids| ids[node] = nil }

    add_default_ids
    resolve_conflicts

    @nodes_ids.each { |node, id| node['id'] = id }
  end

  private

  # Makes the ids unique: first by prepending parent ids, then by numbering
  # the ids that are still duplicated. Does nothing when all ids are unique.
  def resolve_conflicts
    return if conflicts(@nodes_ids).empty?

    # { h4-node -> [h3-node, h2-node] }
    @parents = @nodes_ids.keys.each_with_object({}) do |node, parents|
      parents[node] = parent_header_nodes(node)
    end

    prepend_parents_on_conflicts
    append_numbers_on_conflicts
  end

  # Prepends parents recursively, one level at a time, until there are no
  # conflicts or a pass leaves every id unchanged.
  #
  # @param parent_index [Integer] position in each node's parent list to use
  #   for this pass (0 is the closest level)
  def prepend_parents_on_conflicts(parent_index = 0)
    original = @nodes_ids.dup

    conflicts(@nodes_ids).each do |node, id|
      parent = @parents[node][parent_index]
      next unless parent

      @nodes_ids[node] = subheader_id("#{subheader_id(parent.content)} #{id}")
    end

    prepend_parents_on_conflicts(parent_index + 1) if original != @nodes_ids
  end

  # Returns the "parents" of +node+, closest level first.
  #
  # Parent != DOM nesting, but in the context of the content: for every
  # higher header level, the parent is the nearest header of that level that
  # appears *before* +node+ in document order. Levels with no such header are
  # skipped.
  #
  # @param node [Object] one of the collected header nodes
  # @return [Array] parent header nodes, e.g. [h3-node, h2-node] for an h4
  def parent_header_nodes(node)
    # `take` yields an empty list for the first header. A `0..index-1` range
    # would turn into `0..-1` there and wrongly select every header.
    preceding = @header_nodes.take(@header_nodes.index(node))

    parents = parent_tags(node.name).map do |parent_tag|
      preceding.reverse_each.find { |candidate| candidate.name == parent_tag }
    end
    parents.compact
  end

  # Lists the header tags above +tag+, closest level first.
  #
  #   "h4" -> ["h3", "h2"]
  #
  # @param tag [String] header tag name such as "h4"
  # @return [Array<String>]
  def parent_tags(tag)
    level = tag.delete('h').to_i
    (level - 1).downto(2).map { |n| "h#{n}" }
  end

  # Appends "-1", "-2", ... to the ids that are still duplicated, in document
  # order. A suffix is skipped when the resulting id is already used by some
  # other header, so numbering never introduces a new duplicate.
  def append_numbers_on_conflicts
    taken = {}
    @nodes_ids.each_value { |id| taken[id] = true }

    conflicts(@nodes_ids).group_by { |_node, id| id }.each do |id, id_conflicts|
      suffix = 0

      id_conflicts.each do |node, _id|
        suffix += 1
        suffix += 1 while taken.key?("#{id}-#{suffix}")

        new_id = "#{id}-#{suffix}"
        taken[new_id] = true
        @nodes_ids[node] = new_id
      end
    end
  end

  # Selects the entries whose id is shared with at least one other entry.
  #
  # @param hash [Hash] { node -> id }
  # @return [Hash] the conflicting subset, in the original order
  def conflicts(hash)
    occurrences = Hash.new(0)
    hash.each_value { |id| occurrences[id] += 1 }

    hash.select { |_node, id| occurrences[id] > 1 }
  end

  # Gives every header node the id derived from its own text.
  def add_default_ids
    @nodes_ids.each_key { |node| @nodes_ids[node] = subheader_id(node.content) }
  end

  # Turns heading text into an id: lowercased, every run of non-word
  # characters collapsed into a single dash, leading/trailing dashes removed.
  #
  # @param content [#to_s] heading text
  # @return [String]
  def subheader_id(content)
    content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
  end
end