lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.3.0 vs lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.4.0

- old
+ new

@@ -2,14 +2,14 @@ module Boilerpipe::SAX class BoilerpipeHTMLParser def self.parse(text) #script bug - delete script tags - text = text.gsub(/\<script>.+?<\/script>/i, '') + text.gsub!(/\<script>.+?<\/script>/i, '') # nokogiri uses libxml for mri and nekohtml for jruby # mri doesn't remove &nbsp; when missing the semicolon - text = text.gsub(/(&nbsp) /, '\1; ') + text.gsub!(/(&nbsp) /, '\1; ') # use nokogiri to fix any bad tags, errors - keep experimenting with this text = Nokogiri::HTML(text).to_html