readability.rb in ruby-readability-0.3.0.pre

- old
+ new

@@ -1,31 +1,40 @@
 require 'rubygems'
 require 'nokogiri'
+require 'guess_html_encoding'
 
 module Readability
   class Document
     DEFAULT_OPTIONS = {
       :retry_length => 250,
       :min_text_length => 25,
       :remove_unlikely_candidates => true,
       :weight_classes => true,
-      :clean_conditionally => true
+      :clean_conditionally => true,
+      :remove_empty_nodes => true
     }.freeze
 
     attr_accessor :options, :html
 
     def initialize(input, options = {})
-      @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
       @options = DEFAULT_OPTIONS.merge(options)
+      @input = input
+
+      if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding]
+        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+        @options[:encoding] = @input.encoding.to_s
+      end
+
+      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
       @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
       @weight_classes = @options[:weight_classes]
       @clean_conditionally = @options[:clean_conditionally]
       make_html
     end
 
     def make_html
-      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
+      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
     end
 
     REGEXES = {
         :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
         :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -219,11 +228,11 @@
           end
         else
           # wrap text nodes in p tags
 #          elem.children.each do |child|
 #            if child.text?
-##              debug("wrapping text node with a p")
+#              debug("wrapping text node with a p")
 #              child.swap("<p>#{child.text}</p>")
 #            end
 #          end
         end
       end
@@ -236,13 +245,15 @@
 
       node.css("form, object, iframe, embed").each do |elem|
         elem.remove
       end
 
-      # remove empty <p> tags
-      node.css("p").each do |elem|
-        elem.remove if elem.content.strip.empty?
+      if @options[:remove_empty_nodes]
+        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+        node.css("p").each do |elem|
+          elem.remove if elem.content.strip.empty?
+        end
       end
 
       # Conditionally clean <table>s, <ul>s, and <div>s
       clean_conditionally(node, candidates, "table, ul, div")
 
@@ -257,21 +268,19 @@
       base_whitelist.each {|tag| whitelist[tag] = true }
       replace_with_whitespace = Hash.new
       base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
 
       ([node] + node.css("*")).each do |el|
-
         # If element is in whitelist, delete all its attributes
         if whitelist[el.node_name]
           el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
 
           # Otherwise, replace the element with its contents
         else
           if replace_with_whitespace[el.node_name]
-            # Adding &nbsp; here, because swap removes regular spaaces
-            el.swap('&nbsp;' << el.text << '&nbsp;')
+            el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
           else
-            el.swap(el.text)
+            el.swap(Nokogiri::XML::Text.new(el.text, el.document))
           end
         end
 
       end