readability.rb in ruby-readability-0.7.2

- old
+ new

@@ -17,13 +17,14 @@
       :min_image_height           => 80,
       :ignore_image_format        => [],
       :blacklist                  => nil,
       :whitelist                  => nil,
       :elements_to_score          => ["p", "td", "pre"],
-      :likely_siblings            => ["p"]
+      :likely_siblings            => ["p"],
+      :ignore_redundant_nesting   => false
     }.freeze
-    
+
     REGEXES = {
         :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
         :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
         :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
         :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
@@ -33,11 +34,11 @@
         :trimRe => /^\s+|\s+$/,
         :normalizeRe => /\s{2,}/,
         :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
         :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
     }
-    
+
     attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
 
     def initialize(input, options = {})
       @options = DEFAULT_OPTIONS.merge(options)
       @input = input
@@ -48,11 +49,11 @@
       end
 
       @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
       @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
       @weight_classes = @options[:weight_classes]
-      @clean_conditionally = @options[:clean_conditionally]
+      @clean_conditionally = !!@options[:clean_conditionally]
       @best_candidate_has_image = true
       make_html
       handle_exclusions!(@options[:whitelist], @options[:blacklist])
     end
 
@@ -143,15 +144,15 @@
           end
         end
 
       (list_images.empty? and content != @html) ? images(@html, true) : list_images
     end
-    
+
+    # In-place variant of images_with_fqdn_uris: it passes @html itself instead of the
+    # default @html.dup, so the src rewrites (and the removal of <img> elements whose src
+    # raises URI::InvalidURIError) are applied to this instance's document.
+    #
+    # source_uri - URI string; it is parsed with URI.parse to obtain the scheme, host and
+    #              port used to build the base URL.
+    #
+    # Returns whatever images_with_fqdn_uris returns (its tail is outside this excerpt;
+    # presumably the collected image URLs - confirm).
     def images_with_fqdn_uris!(source_uri)
       images_with_fqdn_uris(@html, source_uri)
     end
-    
+
     def images_with_fqdn_uris(document = @html.dup, source_uri)
       uri = URI.parse(source_uri)
       host = uri.host
       scheme = uri.scheme
       port = uri.port # defaults to 80
@@ -159,11 +160,11 @@
       base = "#{scheme}://#{host}:#{port}/"
 
       images = []
       document.css("img").each do |elem|
         begin
-          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil 
+          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
           images << elem['src'].to_s
         rescue URI::InvalidURIError => exc
           elem.remove
         end
       end
@@ -262,18 +263,29 @@
       # Things like preambles, content split by ads that we removed, etc.
 
       sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
       downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
       output = Nokogiri::XML::Node.new('div', @html)
-      best_candidate[:elem].parent.children.each do |sibling|
+
+      # If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
+      # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
+      # related content detection, but could lead to false positives. Not supported in arc90's readability.
+      node =
+        if options[:ignore_redundant_nesting]
+          closest_node_with_siblings(best_candidate[:elem])
+        else
+          best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
+        end
+
+      node.parent.children.each do |sibling|
         append = false
-        append = true if sibling == best_candidate[:elem]
+        append = true if sibling == node
         append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
 
         if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
-          node_content = sibling.text
+          node_content = sibling.text.strip
           node_length = node_content.length
 
           append = if node_length > 80 && link_density < 0.25
             true
           elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
@@ -289,10 +301,27 @@
       end
 
       output
     end
 
+    # Finds the node whose siblings should be scanned for related content when
+    # :ignore_redundant_nesting is enabled.
+    #
+    # Starting at `element`, climbs through its ancestors for as long as the current node is
+    # the only child of its parent (whitespace-only text nodes are not counted as children).
+    # Climbing stops at <body>, or at the topmost node that still has a parent, so the caller
+    # can read `node.parent.children` on the result whenever it could on `element` itself.
+    #
+    # element - the node to start from (presumably a Nokogiri node; it must respond to
+    #           node_name, parent, children, text? and text).
+    #
+    # Returns `element` or one of its ancestors.
+    def closest_node_with_siblings(element)
+      node = element
+
+      until node.node_name == 'body'
+        parent = node.parent
+        # A detached node has nothing above it to inspect.
+        break if parent.nil?
+
+        meaningful = parent.children.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
+        break if meaningful.size > 1
+
+        # Never step onto the root (a node without a parent). When no <body> is on the ancestor
+        # chain the walk would otherwise reach the document and then call nil.children, and the
+        # caller would fail the same way on node.parent.children.
+        break if parent.parent.nil?
+
+        node = parent
+      end
+
+      node
+    end
+
     def select_best_candidate(candidates)
       sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
 
       debug("Top 5 candidates:")
       sorted_candidates[0...5].each do |candidate|
@@ -370,11 +399,15 @@
       content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
       { :content_score => content_score, :elem => elem }
     end
 
     def debug(str)
-      puts str if options[:debug]
+      if options[:debug].respond_to?(:call)
+        options[:debug].call(str)
+      elsif options[:debug]
+        puts str
+      end
     end
 
     def remove_unlikely_candidates!
       @html.css("*").each do |elem|
         str = "#{elem[:class]}#{elem[:id]}"
@@ -424,11 +457,12 @@
       # Conditionally clean <table>s, <ul>s, and <div>s
       clean_conditionally(node, candidates, "table, ul, div")
 
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
-      all_whitelisted = base_whitelist.include?("*")
+      all_tags_whitelisted = base_whitelist.include?("*")
+      all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
 
       # We'll add whitespace instead of block elements,
       # so a<br>b will have a nice space between them
       base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
 
@@ -438,12 +472,12 @@
       replace_with_whitespace = Hash.new
       base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
 
       ([node] + node.css("*")).each do |el|
         # If element is in whitelist, delete all its attributes
-        if all_whitelisted || whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+        if all_tags_whitelisted || whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
 
           # Otherwise, replace the element with its contents
         else
           # If element is root, replace the node as a text node
           if el.parent.nil?
@@ -468,32 +502,45 @@
       return html.gsub(/[\r\n\f]+/, "\n" )
     end
 
+    # Removes elements under `node` that match `selector` and look like non-content.
+    # Does nothing unless @clean_conditionally is truthy.
+    #
+    # An element is flagged for removal when its class weight plus its candidate content
+    # score is negative, or when its text contains fewer than 10 commas and
+    # clean_conditionally_reason? returns a reason. If options[:clean_conditionally]
+    # responds to #call, it is invoked for every matched element with a context hash
+    # (:remove, :message, :weight, :content_score, :el) and its return value replaces the
+    # removal decision.
+    #
+    # node       - element whose descendants are examined via node.css(selector).
+    # candidates - hash keyed by element; entries expose :content_score (0 is used when absent).
+    # selector   - CSS selector string, e.g. "table, ul, div".
     def clean_conditionally(node, candidates, selector)
       return unless @clean_conditionally
+
       node.css(selector).each do |el|
         weight = class_weight(el)
         content_score = candidates[el] ? candidates[el][:content_score] : 0
         name = el.name.downcase
-        
+        # The decision and its log message are only recorded here; the removal itself happens
+        # after the optional user callback has had the chance to override it.
+        remove = false
+        message = nil
+
         if weight + content_score < 0
-          el.remove
-          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+          remove = true
+          message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
         elsif el.text.count(",") < 10
           counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
+          # NOTE(review): the -100 offset presumably keeps ordinary lists from tripping the
+          # li-count heuristic - confirm against clean_conditionally_reason?.
           counts["li"] -= 100
 
           # For every img under a noscript tag discount one from the count to avoid double counting
           counts["img"] -= el.css("noscript").css("img").length
-                
+
           content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
           link_density = get_link_density(el)
 
           reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
           if reason
-            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
-            el.remove
+            message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
+            remove = true
           end
+        end
+
+        if options[:clean_conditionally].respond_to?(:call)
+          context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
+          remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
+        end
+
+        # message is nil when the callback asked for a removal the heuristics did not flag.
+        if remove
+          debug(message || "Conditionally cleaned by user-specified function.")
+          el.remove
         end
       end
     end
 
     def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)