Sha256: 1bf1344867d6ac45facc8140308f5b43f7e2b32844435f74ce4ff3bc0ecea6b7
Contents?: true
Size: 1.17 KB
Versions: 3
Compression:
Stored size: 1.17 KB
Contents
# frozen_string_literal: true

require "nokogiri"

module Uptriever
  # Splits an HTML chunk into smaller chunks, one per top-level <h2> section.
  #
  # A chunk is a Hash. It must contain :chunk_html; when the document is
  # actually split it must also contain :link and :tracking_id (both are read
  # with `fetch`, so a missing key raises KeyError).
  class Chunker
    attr_reader :chunk

    # @param chunk [Hash] source chunk; it is never mutated, only shallow-copied
    def initialize(chunk)
      @chunk = chunk
    end

    # Builds the list of chunks derived from the source chunk.
    #
    # * Without an <h1> the document is not split: a single shallow copy of the
    #   source chunk is returned with its HTML intact.
    # * With an <h1> the first element is the "root" chunk (weight 1.5, title
    #   taken from the <h1>) followed by one chunk per direct-child <h2> of
    #   <body>. Every other direct child of <body> is serialized into the HTML
    #   of the chunk that is current at that point.
    #
    # @return [Array<Hash>]
    # @raise [KeyError] if a required key is missing from the source chunk
    def chunks
      doc = Nokogiri::HTML(chunk.fetch(:chunk_html))
      header = doc.at_css("h1")
      # Nothing to split on: hand the content back untouched. (Using chunk_dup
      # here would blank :chunk_html and silently drop the whole document.)
      return [chunk.dup] unless header

      # Root chunks are usually less specific, so make them weigh less
      root_chunk = chunk_dup.tap do |root|
        root[:weight] = 1.5
        root[:metadata] = {title: header.inner_text}
      end

      doc.xpath("//body").children.each_with_object([root_chunk]) do |child, acc|
        if child.name == "h2"
          # An <h2> opens a new chunk; its text is kept as the chunk title
          # rather than being appended to the HTML.
          acc << section_chunk(child)
        else
          acc.last[:chunk_html] << child.to_xhtml
        end
      end
    end

    private

    # Builds the (initially empty) chunk for the section opened by +heading+,
    # deriving its link and tracking id from the heading text.
    def section_chunk(heading)
      title = heading.inner_text
      # Anything outside [a-z0-9] becomes "-", one dash per character.
      anchor = title.downcase.gsub(/[^a-z0-9]/, "-")
      section = chunk_dup

      section.merge!(
        link: "#{section.fetch(:link)}?id=#{anchor}",
        tracking_id: "#{section.fetch(:tracking_id)}##{anchor}",
        metadata: {title: title}
      )
    end

    # Shallow copy of the source chunk with a fresh, mutable, empty HTML buffer.
    def chunk_dup = chunk.dup.tap { _1[:chunk_html] = +"" }
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
uptriever-0.1.1 | lib/uptriever/chunker.rb |
uptriever-0.1.0 | lib/uptriever/chunker.rb |
uptriever-0.0.1 | lib/uptriever/chunker.rb |