Sha256: d0f2efaa86b2725b54e1e3d9080b534488d2597af0be4f0e1f75a26d39bbec3e
Contents?: true
Size: 1003 Bytes
Versions: 1
Compression:
Stored size: 1003 Bytes
Contents
# Marks all TextBlocks "content" which are between the headline and the part
# that has already been marked content, if they are marked MIGHT_BE_CONTENT.
#
# This filter is quite specific to the news domain.
#
# Used downstream of KeepLargestBlockFilter since that's what sets
# MIGHT_BE_CONTENT.

# Nested definitions (rather than `module Boilerpipe::Filters`) so this file
# loads regardless of whether Boilerpipe has been defined yet.
# NOTE(review): assumes Boilerpipe is declared as a module elsewhere - confirm.
module Boilerpipe
  module Filters
    class ExpandTitleToContentFilter
      # Promotes MIGHT_BE_CONTENT blocks lying between the last TITLE block
      # (inclusive) and the first content block (exclusive) to content.
      #
      # The document is left untouched when there is no TITLE block, no
      # content block, or the last TITLE block does not precede the first
      # content block.
      #
      # @param doc [#text_blocks] document whose blocks respond to
      #   `has_label?`, `is_content?` and `content=`
      # @return [Object] the same +doc+, with its blocks mutated in place
      def self.process(doc)
        blocks = doc.text_blocks

        # rindex locates the last TITLE block by position in a single pass;
        # looking the block up again with `index` would match by `==` and
        # could return an earlier, merely equal block.
        title_idx = blocks.rindex { |tb| tb.has_label?(:TITLE) }
        content_start = blocks.index(&:is_content?)

        return doc if no_title_with_subsequent_content?(content_start, title_idx)

        # Exclusive range: the block at content_start is already content.
        blocks[title_idx...content_start].each do |tb|
          tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
        end

        doc
      end

      # Tells whether there is nothing to expand.
      #
      # @param content_start [Integer, nil] index of the first content block
      # @param title_idx [Integer, nil] index of the title block
      # @return [Boolean] true when either index is nil or the title does not
      #   come strictly before the content
      def self.no_title_with_subsequent_content?(content_start, title_idx)
        # title has to start before content
        title_idx.nil? || content_start.nil? || title_idx >= content_start
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
boilerpipe-ruby-0.5.0 | lib/boilerpipe/filters/expand_title_to_content_filter.rb |