Sha256: f6696a76de68a05b729971cdfcd82fd13f2c0d8b9699a53a3ab3893089e0571e
Contents?: true
Size: 1.24 KB
Versions: 4
Compression:
Stored size: 1.24 KB
Contents
# Nested definition (instead of the compact `module Boilerpipe::Filters`) so this
# file loads even when the parent namespace has not been defined yet.
# NOTE(review): assumes Boilerpipe is declared as a module (not a class) - confirm.
module Boilerpipe
  module Filters
    # Marks all TextBlocks "content" which are between the headline and the part
    # that has already been marked content, if they are marked MIGHT_BE_CONTENT.
    # This filter is quite specific to the news domain.
    # Used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT.
    class ExpandTitleToContentFilter
      # Promotes MIGHT_BE_CONTENT blocks lying between the last TITLE block and
      # the first content block to content.
      #
      # @param doc [#text_blocks] document whose text blocks respond to
      #   +has_label?+, +is_content?+ and +content=+
      # @return [Object] the same +doc+ that was passed in (blocks are mutated in place)
      def self.process(doc)
        tbs = doc.text_blocks

        # Only the first content block matters; nothing after it can change the result.
        content_start = tbs.index(&:is_content?)
        return doc if content_start.nil?

        # Walk backwards from the first content block to the nearest TITLE.
        # The content block itself is part of the search on purpose: when it also
        # carries the TITLE label there is no gap to fill and nothing is expanded.
        title = content_start.downto(0).find { |i| tbs[i].has_label?(:TITLE) }
        return doc if no_title_with_subsequent_content?(content_start, title)

        tbs[title...content_start].each do |tb|
          tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
        end
        doc
      end

      # Tells whether there is nothing to expand.
      #
      # @param content_start [Integer, nil] index of the first content block
      # @param title [Integer, nil] index of the title block
      # @return [Boolean] true when either index is missing or the title does not
      #   come strictly before the content start
      def self.no_title_with_subsequent_content?(content_start, title)
        title.nil? || content_start.nil? || content_start <= title
      end
    end
  end
end
Version data entries
4 entries across 4 versions & 1 rubygems