boilerpipe_html_parser.rb in boilerpipe-ruby-0.4.4

- old
+ new

@@ -1,17 +1,12 @@
 module Boilerpipe::SAX
   class BoilerpipeHTMLParser
     # Parses an HTML string and returns the content handler's text document.
     #
     # The markup is cleaned up first, then round-tripped through Nokogiri::HTML
     # to repair malformed tags, and finally streamed through a Nokogiri SAX
     # parser that drives an HTMLContentHandler.
     #
     # @param text [String] raw HTML (presumably a String - the removed lines
     #   called gsub! on it; TODO confirm against callers)
     # @return whatever HTMLContentHandler#text_document returns (its type is
     #   not visible in this hunk)
     def self.parse(text)
-      # script bug - delete script tags
-      text.gsub!(/\<script>.+?<\/script>/i, '')
+      # strip out tags that cause issues
+      text = Preprocessor.strip(text)
 
-      # nokogiri uses libxml for mri and nekohtml for jruby
-      # mri doesn't remove &nbsp; when missing the semicolon
-      text.gsub!(/(&nbsp) /, '\1; ')
-
       # use nokogiri to fix any bad tags, errors - keep experimenting with this
       text = Nokogiri::HTML(text).to_html
-
       handler = HTMLContentHandler.new
       noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
       noko_parser.parse(text)
       # the result is read back from the handler; the return value of
       # noko_parser.parse is not used
       handler.text_document
     end