Sha256: ad598bd6a007d532dd1df025ea608591e46c606cf728ff115d35fa9fae9d3ad4

Contents?: true

Size: 556 Bytes

Versions: 5

Compression:

Stored size: 556 Bytes

Contents

# Removes TextBlocks which have explicitly been marked as "not content".

module Boilerpipe
  module Filters
    # Drops the text blocks of a document that are flagged as "not content",
    # optionally sparing the blocks that carry one particular label.
    class BoilerplateBlockFilter
      # @param label [Object, nil] label whose blocks survive even though they
      #   are not content; with nil every non-content block is removed.
      def initialize(label)
        @label_to_keep = label
      end

      # Ready-made filter that removes boilerplate but keeps :TITLE blocks.
      INSTANCE_KEEP_TITLE = BoilerplateBlockFilter.new(:TITLE)

      # Removes the non-content blocks from +doc+.
      #
      # @param doc [#text_blocks, #replace_text_blocks!] document to filter
      # @return [Object] the same +doc+ that was passed in
      def process(doc)
        # delete_if prunes the document's own array in place and returns it;
        # the result is still handed back through replace_text_blocks!.
        remaining = doc.text_blocks.delete_if { |tb| removable?(tb) }
        doc.replace_text_blocks!(remaining)
        doc
      end

      private

      # True when +tb+ is not content and is not protected by the kept label.
      def removable?(tb)
        return false unless tb.is_not_content?

        # Check the label given to the constructor rather than a fixed :TITLE,
        # so filters built with another label protect the blocks they name.
        @label_to_keep.nil? || !tb.has_label?(@label_to_keep)
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/boilerplate_block_filter.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/boilerplate_block_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/boilerplate_block_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/boilerplate_block_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/boilerplate_block_filter.rb