Sha256: 0ad670d4cb84fc7950193a35c3a7386527a8ff8316b4820c856334d0a4462d7d

Contents?: true

Size: 1.92 KB

Versions: 2

Compression:

Stored size: 1.92 KB

Contents

module Boilerpipe::Extractors
  # Runs a fixed sequence of Boilerpipe filters over a parsed document and
  # returns the document's content afterwards.
  class ArticleExtractor
    # Parses +contents+ with BoilerpipeHTMLParser and passes the resulting
    # document to ArticleExtractor.process.
    #
    # contents - handed unchanged to BoilerpipeHTMLParser.parse
    #            (presumably HTML markup; confirm against the parser).
    #
    # Returns the value produced by ArticleExtractor.process.
    def self.text(contents)
      parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
      ::Boilerpipe::Extractors::ArticleExtractor.process(parsed)
    end

    # Applies the article filter chain to +doc+, one filter after another.
    # Filter return values are ignored, so each filter is expected to update
    # +doc+ in place. The order of the calls below is significant: later
    # steps rely on the labels set by earlier ones.
    #
    # doc - a document responding to #title and #content.
    #
    # Returns doc.content once every filter has run.
    def self.process(doc)
      headline = doc.title
      stage = ::Boilerpipe::Filters

      # Flag blocks that signal the end of the text (:INDICATES_END_OF_TEXT).
      stage::TerminatingBlocksFinder.process(doc)

      # Flag blocks that match the document title.
      stage::DocumentTitleMatchClassifier.new(headline).process(doc)

      # Classify blocks as content / non-content using the boilerpipe algorithm.
      stage::NumWordsRulesClassifier.process(doc)

      # Blocks following an INDICATES_END_OF_TEXT block become non-content.
      stage::IgnoreBlocksAfterContentFilter.process(doc)

      # HEADING blocks that trail the existing content become non-content.
      stage::TrailingHeadlineToBoilerplateFilter.process(doc)

      # Fuse text blocks that sit next to each other.
      stage::BlockProximityFusion::MAX_DISTANCE_1.process(doc)

      # Drop the non-content blocks.
      stage::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE.process(doc)

      # Fuse neighbouring blocks again, this time only when they share a tag level.
      stage::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc)

      # Keep only the largest text block as content.
      stage::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc)

      # Mark as content every block lying between the headline and the part
      # that is already content.
      stage::ExpandTitleToContentFilter.process(doc)

      # Blocks with a lot of text at the same tag level as the largest current
      # content block become additional content.
      stage::LargeBlockSameTagLevelToContentFilter.process(doc)

      # Nested list-item blocks after the end of the main content become content.
      stage::ListAtEndFilter.process(doc)

      doc.content
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
boilerpipe-ruby-0.1.1 lib/boilerpipe/extractors/article_extractor.rb
boilerpipe-ruby-0.1.0 lib/boilerpipe/extractors/article_extractor.rb