Sha256: d0f2efaa86b2725b54e1e3d9080b534488d2597af0be4f0e1f75a26d39bbec3e

Contents?: true

Size: 1003 Bytes

Versions: 1

Compression:

Stored size: 1003 Bytes

Contents

# Marks all TextBlocks "content" which are between the headline and the part that has
# already been marked content, if they are marked MIGHT_BE_CONTENT.
# This filter is quite specific to the news domain.
# Used downstream of KeepLargestBlockFilter, since that is what sets MIGHT_BE_CONTENT.

module Boilerpipe::Filters
  # Promotes text blocks labelled :MIGHT_BE_CONTENT to content when they sit
  # between the headline (a block labelled :TITLE) and the first block that is
  # already marked as content.
  class ExpandTitleToContentFilter
    # Marks the qualifying blocks of +doc+ as content, in place.
    #
    # The headline is the last :TITLE block found at or before the first
    # content block. Titles that appear after the content has started are
    # ignored, so a trailing :TITLE block cannot hide the real headline.
    #
    # @param doc an object whose #text_blocks returns an Array of blocks that
    #   respond to #has_label?, #is_content? and #content=
    # @return the same +doc+ that was passed in
    def self.process(doc)
      tbs = doc.text_blocks

      content_start = tbs.find_index(&:is_content?)
      return doc if content_start.nil?

      # Locate the headline by position (not by object equality) and only
      # within the blocks up to and including the first content block.
      title_idx = tbs[0..content_start].rindex { |tb| tb.has_label?(:TITLE) }

      # A headline that is itself the first content block leaves nothing
      # in between to expand, which the guard below also rejects.
      return doc if no_title_with_subsequent_content?(content_start, title_idx)

      tbs[title_idx...content_start]
        .select { |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
        .each { |tb| tb.content = true }

      doc
    end

    # Tells whether there is nothing to expand: either index is missing, or
    # the title does not come strictly before the start of the content.
    #
    # @param content_start [Integer, nil] index of the first content block
    # @param title_idx [Integer, nil] index of the headline block
    # @return [Boolean]
    def self.no_title_with_subsequent_content?(content_start, title_idx)
      # title has to start before content
      title_idx.nil? || content_start.nil? || title_idx >= content_start
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/expand_title_to_content_filter.rb