lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.4.3 vs lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.4.4
- old
+ new
@@ -1,17 +1,12 @@
module Boilerpipe::SAX
class BoilerpipeHTMLParser
# Parses HTML markup and returns the text document built by the SAX handler.
#
# Reading this diff: lines prefixed with "- " exist only in 0.4.3, lines
# prefixed with "+ " exist only in 0.4.4, and unprefixed lines are shared.
#
# text - the HTML markup to parse (presumably a String - confirm with callers).
#
# Returns the value of HTMLContentHandler#text_document after parsing
# (concrete type is not visible in this hunk).
def self.parse(text)
# NOTE(review): the removed 0.4.3 lines call gsub! directly on the argument,
# which edits the caller's string in place. The added 0.4.4 line rebinds the
# local `text` to the result of Preprocessor.strip instead; whether
# Preprocessor.strip itself mutates its argument is not visible here.
- # script bug - delete script tags
- text.gsub!(/\<script>.+?<\/script>/i, '')
+ # strip out tags that cause issues
+ text = Preprocessor.strip(text)
- # nokogiri uses libxml for mri and nekohtml for jruby
- # mri doesn't remove when missing the semicolon
- text.gsub!(/( ) /, '\1; ')
-
# use nokogiri to fix any bad tags, errors - keep experimenting with this
text = Nokogiri::HTML(text).to_html
-
# Second pass: feed the re-serialized markup through Nokogiri's SAX parser so
# the handler receives the parse events, then hand back what it collected.
handler = HTMLContentHandler.new
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
noko_parser.parse(text)
handler.text_document
end