Sha256: cbf5fc70c53416849f55b8d976da27c8e60f388eceed3e5d347cd453b7aba47a

Contents?: true

Size: 812 Bytes

Versions: 5

Compression:

Stored size: 812 Bytes

Contents

 # Marks all blocks as "non-content" that occur after blocks that have been
 # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
 # number of words in content blocks occur before this mark (default: 60).
 # This can be used in conjunction with an upstream TerminatingBlocksFinder.

module Boilerpipe::Filters
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase

    def self.process(doc, min_num_words=60)
      found_end_of_text = false
      num_words = 0

      doc.text_blocks.each do |tb|
        end_of_text = tb.has_label? :INDICATES_END_OF_TEXT
        num_words += num_full_text_words(tb) if tb.is_content?
        found_end_of_text = true if end_of_text && num_words >= min_num_words
        tb.content = false if found_end_of_text
      end

      doc
    end

  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
boilerpipe-ruby-0.1.1 lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
boilerpipe-ruby-0.1.0 lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb