Sha256: 9e19fb2f0645c1ead54ec4fa15939fa4d8849ff4b01e07c0ffe2f0033074a1e3

Contents?: true

Size: 1.93 KB

Versions: 8

Compression:

Stored size: 1.93 KB

Contents

module Boilerpipe::Extractors
  # Runs a fixed, ordered sequence of Boilerpipe filters over a parsed document.
  class ArticleExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, passes the resulting
      # document through ArticleExtractor.process, and returns that
      # document's +content+.
      def text(contents)
        ::Boilerpipe::SAX::BoilerpipeHTMLParser
          .parse(contents)
          .tap { |parsed| ::Boilerpipe::Extractors::ArticleExtractor.process(parsed) }
          .content
      end

      # Applies each filter to +doc+ in the order listed below and returns
      # the same +doc+ object. The order of the steps is significant: later
      # filters act on the labels left behind by earlier ones.
      def process(doc)
        # Read the title once, before any filter has touched the document.
        headline = doc.title
        stage = ::Boilerpipe::Filters

        # Flag blocks that signal the end of the text (:INDICATES_END_OF_TEXT).
        stage::TerminatingBlocksFinder.process(doc)

        # Flag blocks that match the document title.
        stage::DocumentTitleMatchClassifier.new(headline).process(doc)

        # Classify blocks as content / non-content (boilerpipe word-count rules).
        stage::NumWordsRulesClassifier.process(doc)

        # Everything after an INDICATES_END_OF_TEXT block becomes non-content.
        stage::IgnoreBlocksAfterContentFilter.process(doc)

        # HEADING blocks that trail the existing content become non-content.
        stage::TrailingHeadlineToBoilerplateFilter.process(doc)

        # Fuse neighbouring text blocks.
        stage::BlockProximityFusion::MAX_DISTANCE_1.process(doc)

        # Drop non-content blocks.
        stage::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE.process(doc)

        # Fuse neighbouring text blocks that share the same tag level.
        stage::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc)

        # Retain only the largest text block as content.
        stage::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc)

        # Blocks lying between the headline and the already-marked content
        # are promoted to content.
        stage::ExpandTitleToContentFilter.process(doc)

        # Text-heavy blocks at the same tag level as the largest current
        # content block are promoted to content as well.
        stage::LargeBlockSameTagLevelToContentFilter.process(doc)

        # Nested list-item blocks following the main content are promoted
        # to content.
        stage::ListAtEndFilter.process(doc)

        doc
      end
    end
  end
end

Version data entries

8 entries across 8 versions & 1 rubygem

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.4.0 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/extractors/article_extractor.rb