Sha256: 2cdfb060c200a8237f2c94e0ac2ad73e7d078a6e1907e5762ca18f65531ec6fa

Contents?: true

Size: 622 Bytes

Versions: 5

Compression:

Stored size: 622 Bytes

Contents

# Marks nested list-item blocks after the end of the main content as content.
#  Used downstream of keep_largest_block_filter.

module Boilerpipe::Filters
  # Promotes list-item text blocks that trail a piece of main content.
  #
  # The filter walks the document's text blocks in order while remembering
  # the tag level of the most recent block that is both content and labelled
  # :VERY_LIKELY_CONTENT. Any following block that sits deeper than that
  # level, carries both the :MIGHT_BE_CONTENT and :LI labels, and has a link
  # density equal to 0 is flagged as content. The first block that matches
  # neither rule clears the remembered level.
  class ListAtEndFilter
    # Sentinel meaning "no anchor block is active": a block's tag level must
    # exceed the remembered level to qualify, so nothing at or below this
    # value can be promoted while it is in effect.
    MAX = 99_999_999

    # Runs the filter over every text block of +doc+, mutating blocks in place.
    #
    # @param doc [#text_blocks] document whose text blocks are inspected
    # @return the same +doc+ object that was passed in
    def self.process(doc)
      anchor_level = MAX

      doc.text_blocks.each do |block|
        # A confirmed content block becomes the new anchor.
        if block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
          anchor_level = block.tag_level
          next
        end

        if trailing_list_item?(block, anchor_level)
          # The anchor is kept, so a run of list items is promoted together.
          block.content = true
        else
          anchor_level = MAX
        end
      end

      doc
    end

    # True when +block+ is nested deeper than +anchor_level+, is labelled
    # :MIGHT_BE_CONTENT and :LI, and has a link density equal to 0.
    def self.trailing_list_item?(block, anchor_level)
      return false unless block.tag_level > anchor_level
      return false unless block.has_label?(:MIGHT_BE_CONTENT)
      return false unless block.has_label?(:LI)

      block.link_density == 0
    end
    private_class_method :trailing_list_item?
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/list_at_end_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/list_at_end_filter.rb