Sha256: a19c9768be580663237bd02d2555a3f275ff74f1b83cb05588a2c90f7eafb7c1

Contents?: true

Size: 625 Bytes

Versions: 5

Compression:

Stored size: 625 Bytes

Contents

# Marks nested list-item blocks that directly follow the main content as content.
# Intended to be used downstream of keep_largest_block_filter.

module Boilerpipe
  module Filters
    # Promotes list-item text blocks that trail a main-content block to content.
    #
    # Blocks are scanned in document order. A block that is already content and
    # labelled :VERY_LIKELY_CONTENT opens a "window" at its tag level. While the
    # window is open, every following block that is nested deeper than that
    # level, labelled both :MIGHT_BE_CONTENT and :LI, and has a link density of
    # zero is flagged as content. Any other block closes the window.
    class ListAtEndFilter
      # Sentinel tag level used while no window is open: the filter relies on
      # no real block having a tag level greater than this value.
      MAX = 99_999_999

      # Runs the filter over +doc+, flagging matching blocks in place.
      #
      # @param doc [#text_blocks] document whose text blocks are examined in order
      # @return [Object] the same +doc+ that was passed in
      def self.process(doc)
        tag_level = MAX

        doc.text_blocks.each do |tb|
          if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
            # Main content found: remember its depth so deeper list items
            # that follow can be recognised.
            tag_level = tb.tag_level
          elsif trailing_list_item?(tb, tag_level)
            tb.content = true
          else
            # Anything else ends the run of list items after the main content.
            tag_level = MAX
          end
        end

        doc
      end

      # True when +tb+ is a link-free list item nested deeper than +tag_level+.
      def self.trailing_list_item?(tb, tag_level)
        tb.tag_level > tag_level &&
          tb.has_label?(:MIGHT_BE_CONTENT) &&
          tb.has_label?(:LI) &&
          tb.link_density == 0
      end
      private_class_method :trailing_list_item?
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygem

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.1.1 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.1.0 lib/boilerpipe/filters/list_at_end_filter.rb