Sha256: cd9ae89bef3c8387c01fecb91c51894b023b7124d67a5a7fb4716f05fe5ee829

Contents?: true

Size: 737 Bytes

Versions: 5

Compression:

Stored size: 737 Bytes

Contents

# A full-text extractor which marks every text block of a page as content
# and then applies MinWordsFilter with the given minimum word count
# (presumably keeping only the blocks that have at least that many words).

module Boilerpipe::Extractors
  # Extractor assembled from three filters applied in a fixed order:
  # SimpleBlockFusionProcessor, MarkEverythingContentFilter and MinWordsFilter.
  class KeepEverythingWithKMinWordsExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the resulting
      # document through .process and returns the document's +content+.
      #
      # min      - threshold forwarded unchanged to MinWordsFilter.
      # contents - input for the parser (presumably an HTML string).
      def text(min, contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process(min, parsed)
        parsed.content
      end

      # Applies the filter pipeline to +doc+ and returns that same +doc+
      # object. The filters are invoked for their effect on the document;
      # their own return values are ignored.
      #
      # min - threshold forwarded unchanged to MinWordsFilter.
      # doc - document object as produced by BoilerpipeHTMLParser.parse.
      def process(min, doc)
        ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process(doc)
        ::Boilerpipe::Filters::MarkEverythingContentFilter.process(doc)
        ::Boilerpipe::Filters::MinWordsFilter.process(min, doc)
        doc
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb