Sha256: b7e52516fdda7d2a027aa26093f42cd6e4fac2b1d1a02d9b10be77200274a9b2

Contents?: true

Size: 722 Bytes

Versions: 5

Compression:

Stored size: 722 Bytes

Contents

# Marks as content every block that:
#   - is on the same tag level as the very likely main content
#     (usually the level of the largest block), and
#   - has a significant number of words (currently at least 100).
# Used downstream of KeepLargestBlockFilter.

module Boilerpipe::Filters
  class LargeBlockSameTagLevelToContentFilter
    # Promotes non-content text blocks to content when they share the tag
    # level of the reference block and contain at least 100 words.
    #
    # The reference block is the first text block in +doc+ that is already
    # content and carries the :VERY_LIKELY_CONTENT label. If no such block
    # exists, +doc+ is returned without any changes.
    #
    # Blocks are modified in place; the same +doc+ object is returned.
    def self.process(doc)
      reference = doc.text_blocks.find { |block| block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT) }
      return doc if reference.nil?

      # Read the reference level once, before any block is modified.
      reference_level = reference.tag_level

      doc.text_blocks.each do |block|
        # Blocks that are already content are skipped; the rest are promoted
        # only when they are long enough and sit on the reference level.
        promote = !block.is_content? &&
                  block.num_words >= 100 &&
                  block.tag_level == reference_level
        block.content = true if promote
      end

      doc
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.1.1 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
boilerpipe-ruby-0.1.0 lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb