readability.rb in ruby-readability-0.6.1

- old
+ new

@@ -13,11 +13,13 @@
       :weight_classes             => true,
       :clean_conditionally        => true,
       :remove_empty_nodes         => true,
       :min_image_width            => 130,
       :min_image_height           => 80,
-      :ignore_image_format        => []
+      :ignore_image_format        => [],
+      :blacklist                  => nil,
+      :whitelist                  => nil
     }.freeze
 
     attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
 
     def initialize(input, options = {})
@@ -33,22 +35,46 @@
       @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
       @weight_classes = @options[:weight_classes]
       @clean_conditionally = @options[:clean_conditionally]
       @best_candidate_has_image = true
       make_html
+      handle_exclusions!(@options[:whitelist], @options[:blacklist])
     end
 
     def prepare_candidates
       @html.css("script, style").each { |i| i.remove }
       remove_unlikely_candidates! if @remove_unlikely_candidates
       transform_misused_divs_into_paragraphs!
-      
+
       @candidates     = score_paragraphs(options[:min_text_length])
       @best_candidate = select_best_candidate(@candidates)
     end
 
-    def make_html
+    def handle_exclusions!(whitelist, blacklist)
+      return unless whitelist || blacklist
+
+      if blacklist
+        elems = @html.css(blacklist)
+        if elems
+          elems.each do |e|
+            e.remove
+          end
+        end
+      end
+
+      if whitelist
+        elems = @html.css(whitelist).to_s
+
+        if body = @html.at_css('body')
+          body.inner_html = elems
+        end
+      end
+
+      @input = @html.to_s
+    end
+
+    def make_html(whitelist=nil, blacklist=nil)
       @html = Nokogiri::HTML(@input, nil, @options[:encoding])
       # In case document has no body, such as from empty string or redirect
       @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
 
       # Remove html comment tags
@@ -76,20 +102,20 @@
           next unless element["src"]
 
           url     = element["src"].value
           height  = element["height"].nil?  ? 0 : element["height"].value.to_i
           width   = element["width"].nil?   ? 0 : element["width"].value.to_i
-          
+
           if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
-            image   = get_image_size(url) 
+            image   = get_image_size(url)
             next unless image
           else
             image = {:width => width, :height => height}
           end
-          
+
           image[:format] = File.extname(url).gsub(".", "")
-          
+
           if tested_images.include?(url)
             debug("Image was tested: #{url}")
             next
           end
 
@@ -103,18 +129,16 @@
 
       (list_images.empty? and content != @html) ? images(@html, true) : list_images
     end
 
     def get_image_size(url)
-      begin
-        w, h = FastImage.size(url)
-        raise "Couldn't get size." if w.nil? || h.nil?
-        {:width => w, :height => h}
-      rescue => e
-        debug("Image error: #{e}")
-        nil
-      end
+      w, h = FastImage.size(url)
+      raise "Couldn't get size." if w.nil? || h.nil?
+      {:width => w, :height => h}
+    rescue => e
+      debug("Image error: #{e}")
+      nil
     end
 
     def image_meets_criteria?(image)
       return false if options[:ignore_image_format].include?(image[:format].downcase)
       image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
@@ -146,46 +170,38 @@
       # Let's grab this author:
       # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
       author_elements = @html.xpath('//meta[@name = "dc.creator"]')
       unless author_elements.empty?
         author_elements.each do |element|
-          if element['content']
-            return element['content'].strip
-          end
+          return element['content'].strip if element['content']
         end
       end
 
       # Now let's try to grab this
       # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
       # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
       author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
       unless author_elements.empty?
         author_elements.each do |element|
-          if element.text
-            return element.text.strip
-          end
+          return element.text.strip if element.text
         end
       end
 
       # Now let's try to grab this
       # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
       # TODO: strip out the (rel)?
       author_elements = @html.xpath('//a[@rel = "author"]')
       unless author_elements.empty?
         author_elements.each do |element|
-          if element.text
-            return element.text.strip
-          end
+          return element.text.strip if element.text
         end
       end
 
       author_elements = @html.xpath('//*[@id = "author"]')
       unless author_elements.empty?
         author_elements.each do |element|
-          if element.text
-            return element.text.strip
-          end
+          return element.text.strip if element.text
         end
       end
     end
 
     def content(remove_unlikely_candidates = :default)
@@ -228,14 +244,14 @@
         if sibling.name.downcase == "p"
           link_density = get_link_density(sibling)
           node_content = sibling.text
           node_length = node_content.length
 
-          if node_length > 80 && link_density < 0.25
-            append = true
+          append = if node_length > 80 && link_density < 0.25
+            true
           elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
-            append = true
+            true
           end
         end
 
         if append
           sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
@@ -300,44 +316,32 @@
     def class_weight(e)
       weight = 0
       return weight unless @weight_classes
 
       if e[:class] && e[:class] != ""
-        if e[:class] =~ REGEXES[:negativeRe]
-          weight -= 25
-        end
-
-        if e[:class] =~ REGEXES[:positiveRe]
-          weight += 25
-        end
+        weight -= 25 if e[:class] =~ REGEXES[:negativeRe]
+        weight += 25 if e[:class] =~ REGEXES[:positiveRe]
       end
 
       if e[:id] && e[:id] != ""
-        if e[:id] =~ REGEXES[:negativeRe]
-          weight -= 25
-        end
-
-        if e[:id] =~ REGEXES[:positiveRe]
-          weight += 25
-        end
+        weight -= 25 if e[:id] =~ REGEXES[:negativeRe]
+        weight += 25 if e[:id] =~ REGEXES[:positiveRe]
       end
 
       weight
     end
 
+    ELEMENT_SCORES = {
+      'div' => 5,
+      'blockquote' => 3,
+      'form' => -3,
+      'th' => -5
+    }.freeze
+
     def score_node(elem)
       content_score = class_weight(elem)
-      case elem.name.downcase
-        when "div"
-          content_score += 5
-        when "blockquote"
-          content_score += 3
-        when "form"
-          content_score -= 3
-        when "th"
-          content_score -= 5
-      end
+      content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
       { :content_score => content_score, :elem => elem }
     end
 
     def debug(str)
       puts str if options[:debug]
@@ -371,11 +375,11 @@
 #          end
         end
       end
     end
 
-    def sanitize(node, candidates, options = {})    
+    def sanitize(node, candidates, options = {})
       node.css("h1, h2, h3, h4, h5, h6").each do |header|
         header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
       end
 
       node.css("form, object, iframe, embed").each do |elem|
@@ -448,40 +452,37 @@
           counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
           counts["li"] -= 100
 
           content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
           link_density = get_link_density(el)
-          to_remove = false
-          reason = ""
 
-          if counts["img"] > counts["p"]
-            reason = "too many images"
-            to_remove = true
-          elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
-            reason = "more <li>s than <p>s"
-            to_remove = true
-          elsif counts["input"] > (counts["p"] / 3).to_i
-            reason = "less than 3x <p>s than <input>s"
-            to_remove = true
-          elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
-            reason = "too short a content length without a single image"
-            to_remove = true
-          elsif weight < 25 && link_density > 0.2
-            reason = "too many links for its weight (#{weight})"
-            to_remove = true
-          elsif weight >= 25 && link_density > 0.5
-            reason = "too many links for its weight (#{weight})"
-            to_remove = true
-          elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
-            reason = "<embed>s with too short a content length, or too many <embed>s"
-            to_remove = true
-          end
-
-          if to_remove
+          reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density)
+          if reason
             debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
             el.remove
           end
         end
       end
     end
+
+    def clean_conditionally_reason?(counts, content_length, options, weight, link_density)
+      if counts["img"] > counts["p"]
+        "too many images"
+      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
+        "more <li>s than <p>s"
+      elsif counts["input"] > (counts["p"] / 3).to_i
+        "less than 3x <p>s than <input>s"
+      elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
+        "too short a content length without a single image"
+      elsif weight < 25 && link_density > 0.2
+        "too many links for its weight (#{weight})"
+      elsif weight >= 25 && link_density > 0.5
+        "too many links for its weight (#{weight})"
+      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
+        "<embed>s with too short a content length, or too many <embed>s"
+      else
+        nil
+      end
+    end
+
   end
 end