lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.3.0 vs lib/boilerpipe/sax/boilerpipe_html_parser.rb in boilerpipe-ruby-0.4.0
- old
+ new
@@ -2,14 +2,14 @@
module Boilerpipe::SAX
class BoilerpipeHTMLParser
def self.parse(text)
#script bug - delete script tags
- text = text.gsub(/\<script>.+?<\/script>/i, '')
+ text.gsub!(/\<script>.+?<\/script>/i, '')
# nokogiri uses libxml for mri and nekohtml for jruby
# mri doesn't remove &nbsp; when missing the semicolon
- text = text.gsub(/( ) /, '\1; ')
+ text.gsub!(/( ) /, '\1; ')
# use nokogiri to fix any bad tags, errors - keep experimenting with this
text = Nokogiri::HTML(text).to_html