Sha256: 7615db2456eea0d62021c4db7081bb3488dd1337ee3d328dc92248f31a042026

Contents?: true

Size: 720 Bytes

Versions: 5

Compression:

Stored size: 720 Bytes

Contents

# Marks as content every block that satisfies both of the following:
#   * it is on the same tag level as the very likely main content
#     (usually the level of the largest block), and
#   * it has a significant number of words (currently: at least 100).
# Used downstream of KeepLargestBlockFilter.

module Boilerpipe
  module Filters
    # Promotes sizeable non-content text blocks to content when they sit on
    # the same tag level as the block already labelled :VERY_LIKELY_CONTENT.
    #
    # The reference tag level is taken from the first block that is both
    # content and carries the :VERY_LIKELY_CONTENT label. If no such block
    # exists the document is returned untouched.
    class LargeBlockSameTagLevelToContentFilter
      # Default minimum number of words a block needs to be promoted.
      MIN_WORDS = 100

      # Flags qualifying blocks of +doc+ as content, mutating them in place.
      #
      # @param doc [#text_blocks] document whose +text_blocks+ respond to
      #   +is_content?+, +has_label?+, +tag_level+, +num_words+ and +content=+
      # @param min_words [Integer] inclusive word-count threshold a block
      #   must reach to be promoted (defaults to MIN_WORDS)
      # @return [Object] the same +doc+ that was passed in
      def self.process(doc, min_words: MIN_WORDS)
        anchor = doc.text_blocks.find do |tb|
          tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
        end

        # Without a reference block there is no tag level to compare against.
        return doc if anchor.nil?

        tag_level = anchor.tag_level

        doc.text_blocks.each do |tb|
          # Blocks already marked as content are never touched.
          next if tb.is_content?

          tb.content = true if tb.num_words >= min_words && tb.tag_level == tag_level
        end

        doc
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb