lib/boilerpipe/extractors/article_extractor.rb in boilerpipe-ruby-0.1.1 vs lib/boilerpipe/extractors/article_extractor.rb in boilerpipe-ruby-0.2.0
- old
+ new
@@ -1,10 +1,11 @@
module Boilerpipe::Extractors
class ArticleExtractor
# Parses +contents+ with BoilerpipeHTMLParser, hands the resulting document
# to ArticleExtractor.process, and returns the document's +content+.
#
# contents - input passed straight to BoilerpipeHTMLParser.parse
#            (presumably an HTML string; confirm against the parser).
#
# Returns whatever doc.content yields after processing.
def self.text(contents)
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
# The return value of process is not used here; per this diff, process now
# returns the doc itself instead of doc.content, so the content is read
# explicitly on the following (added) line.
::Boilerpipe::Extractors::ArticleExtractor.process(doc)
+ doc.content
end
def self.process(doc)
title = doc.title
@@ -44,9 +45,9 @@
filters::LargeBlockSameTagLevelToContentFilter.process doc
# Marks nested list-item blocks after the end of the main content as content.
filters::ListAtEndFilter.process doc
- doc.content
+ doc
end
end
end