Sha256: fe8d8095be594fb522061a2d448e64c770a1397e15dd0a752d4cc7952af74ed7

Contents?: true

Size: 1.49 KB

Versions: 5

Compression:

Stored size: 1.49 KB

Contents

# encoding: utf-8

# Finds blocks which are potentially indicating the end of an article
# text and marks them with INDICATES_END_OF_TEXT. This can be used
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.


module Boilerpipe::Filters
  class TerminatingBlocksFinder
    def self.process(doc)
      doc.text_blocks.each do |tb|
        next unless tb.num_words < 15
        if tb.text.length >= 8 && finds_match?(tb.text.downcase)
          tb.labels << :INDICATES_END_OF_TEXT
        elsif tb.link_density == 1.0 && tb.text == 'comment'
          tb.labels << :INDICATES_END_OF_TEXT
        end
      end

      doc
    end

    def self.finds_match?(text)
      text.start_with?('comments') ||
        text =~ /^\d+ (comments|users responded in)/ || # starts with number
        text.start_with?('© reuters') ||
        text.start_with?('please rate this') ||
        text.start_with?('post a comment') ||
        text.include?('what you think...') ||
        text.include?('add your comment') ||
        text.include?('add comment') ||
        #TODO add this and test
        #text.include?('leave a reply') ||
        #text.include?('leave a comment') ||
        #text.include?('show comments') ||
        #text.include?('Share this:') ||
        text.include?('reader views') ||
        text.include?('have your say') ||
        text.include?('reader comments') ||
        text.include?('rätta artikeln') ||
        text == 'thanks for your comments - this feedback is now closed'
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/terminating_blocks_finder.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/filters/terminating_blocks_finder.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/filters/terminating_blocks_finder.rb
boilerpipe-ruby-0.1.1 lib/boilerpipe/filters/terminating_blocks_finder.rb
boilerpipe-ruby-0.1.0 lib/boilerpipe/filters/terminating_blocks_finder.rb