Sha256: f6696a76de68a05b729971cdfcd82fd13f2c0d8b9699a53a3ab3893089e0571e

Contents?: true

Size: 1.24 KB

Versions: 4

Compression:

Stored size: 1.24 KB

Contents

# Marks all TextBlocks "content" which are between the headline and the part that has
# already been marked content, if they are marked MIGHT_BE_CONTENT.
# This filter is quite specific to the news domain.
# Used downstream of KeepLargestBlockFilter, since that is what sets MIGHT_BE_CONTENT.

module Boilerpipe::Filters
  class ExpandTitleToContentFilter
    # Promotes MIGHT_BE_CONTENT blocks lying between the title and the first
    # content block to content.
    #
    # The title position is the index of the last block labelled :TITLE that
    # occurs at or before the first block answering true to `is_content?`.
    # Every block from that title (inclusive) up to the first content block
    # (exclusive) that carries :MIGHT_BE_CONTENT gets `content = true`.
    #
    # The document is returned untouched when there is no such title, no
    # content block, or the first content block does not come after the title.
    #
    # NOTE: a select/find_index formulation is possible, but it must pick the
    # last TITLE seen *before* the first content block, not the last TITLE in
    # the whole document.
    #
    # @param doc an object whose `text_blocks` returns an indexable list of
    #   blocks responding to `has_label?`, `is_content?` and `content=`
    # @return the same `doc` that was passed in (its blocks may be mutated)
    def self.process(doc)
      blocks = doc.text_blocks
      title_idx = nil
      content_idx = nil

      blocks.each_with_index do |block, idx|
        title_idx = idx if block.has_label?(:TITLE)
        next unless block.is_content?

        # First content block found: nothing after it can affect the result.
        content_idx = idx
        break
      end

      return doc if no_title_with_subsequent_content?(content_idx, title_idx)

      blocks[title_idx...content_idx].each do |block|
        next unless block.has_label?(:MIGHT_BE_CONTENT)

        block.content = true
      end

      doc
    end

    # True when there is nothing to expand: either index is missing, or the
    # first content block is not located strictly after the title.
    #
    # @param content_start [Integer, nil] index of the first content block
    # @param title [Integer, nil] index of the title block
    # @return [Boolean]
    def self.no_title_with_subsequent_content?(content_start, title)
      return true if title.nil? || content_start.nil?

      content_start <= title
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/expand_title_to_content_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/expand_title_to_content_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/expand_title_to_content_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/expand_title_to_content_filter.rb