lib/readability.rb in ruby-readability-0.6.0 vs lib/readability.rb in ruby-readability-0.6.1

- old
+ new

@@ -13,11 +13,13 @@ :weight_classes => true, :clean_conditionally => true, :remove_empty_nodes => true, :min_image_width => 130, :min_image_height => 80, - :ignore_image_format => [] + :ignore_image_format => [], + :blacklist => nil, + :whitelist => nil }.freeze attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @@ -33,22 +35,46 @@ @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] @best_candidate_has_image = true make_html + handle_exclusions!(@options[:whitelist], @options[:blacklist]) end def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! - + @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end - def make_html + def handle_exclusions!(whitelist, blacklist) + return unless whitelist || blacklist + + if blacklist + elems = @html.css(blacklist) + if elems + elems.each do |e| + e.remove + end + end + end + + if whitelist + elems = @html.css(whitelist).to_s + + if body = @html.at_css('body') + body.inner_html = elems + end + end + + @input = @html.to_s + end + + def make_html(whitelist=nil, blacklist=nil) @html = Nokogiri::HTML(@input, nil, @options[:encoding]) # In case document has no body, such as from empty string or redirect @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0 # Remove html comment tags @@ -76,20 +102,20 @@ next unless element["src"] url = element["src"].value height = element["height"].nil? ? 0 : element["height"].value.to_i width = element["width"].nil? ? 0 : element["width"].value.to_i - + if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) 
- image = get_image_size(url) + image = get_image_size(url) next unless image else image = {:width => width, :height => height} end - + image[:format] = File.extname(url).gsub(".", "") - + if tested_images.include?(url) debug("Image was tested: #{url}") next end @@ -103,18 +129,16 @@ (list_images.empty? and content != @html) ? images(@html, true) : list_images end def get_image_size(url) - begin - w, h = FastImage.size(url) - raise "Couldn't get size." if w.nil? || h.nil? - {:width => w, :height => h} - rescue => e - debug("Image error: #{e}") - nil - end + w, h = FastImage.size(url) + raise "Couldn't get size." if w.nil? || h.nil? + {:width => w, :height => h} + rescue => e + debug("Image error: #{e}") + nil end def image_meets_criteria?(image) return false if options[:ignore_image_format].include?(image[:format].downcase) image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) @@ -146,46 +170,38 @@ # Let's grab this author: # <meta name="dc.creator" content="Finch - http://www.getfinch.com" /> author_elements = @html.xpath('//meta[@name = "dc.creator"]') unless author_elements.empty? author_elements.each do |element| - if element['content'] - return element['content'].strip - end + return element['content'].strip if element['content'] end end # Now let's try to grab this # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span> # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div> author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]') unless author_elements.empty? author_elements.each do |element| - if element.text - return element.text.strip - end + return element.text.strip if element.text end end # Now let's try to grab this # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a> # TODO: strip out the (rel)? 
author_elements = @html.xpath('//a[@rel = "author"]') unless author_elements.empty? author_elements.each do |element| - if element.text - return element.text.strip - end + return element.text.strip if element.text end end author_elements = @html.xpath('//*[@id = "author"]') unless author_elements.empty? author_elements.each do |element| - if element.text - return element.text.strip - end + return element.text.strip if element.text end end end def content(remove_unlikely_candidates = :default) @@ -228,14 +244,14 @@ if sibling.name.downcase == "p" link_density = get_link_density(sibling) node_content = sibling.text node_length = node_content.length - if node_length > 80 && link_density < 0.25 - append = true + append = if node_length > 80 && link_density < 0.25 + true elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ - append = true + true end end if append sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects @@ -300,44 +316,32 @@ def class_weight(e) weight = 0 return weight unless @weight_classes if e[:class] && e[:class] != "" - if e[:class] =~ REGEXES[:negativeRe] - weight -= 25 - end - - if e[:class] =~ REGEXES[:positiveRe] - weight += 25 - end + weight -= 25 if e[:class] =~ REGEXES[:negativeRe] + weight += 25 if e[:class] =~ REGEXES[:positiveRe] end if e[:id] && e[:id] != "" - if e[:id] =~ REGEXES[:negativeRe] - weight -= 25 - end - - if e[:id] =~ REGEXES[:positiveRe] - weight += 25 - end + weight -= 25 if e[:id] =~ REGEXES[:negativeRe] + weight += 25 if e[:id] =~ REGEXES[:positiveRe] end weight end + ELEMENT_SCORES = { + 'div' => 5, + 'blockquote' => 3, + 'form' => -3, + 'th' => -5 + }.freeze + def score_node(elem) content_score = class_weight(elem) - case elem.name.downcase - when "div" - content_score += 5 - when "blockquote" - content_score += 3 - when "form" - content_score -= 3 - when "th" - content_score -= 5 - end + content_score += 
ELEMENT_SCORES.fetch(elem.name.downcase, 0) { :content_score => content_score, :elem => elem } end def debug(str) puts str if options[:debug] @@ -371,11 +375,11 @@ # end end end end - def sanitize(node, candidates, options = {}) + def sanitize(node, candidates, options = {}) node.css("h1, h2, h3, h4, h5, h6").each do |header| header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 end node.css("form, object, iframe, embed").each do |elem| @@ -448,40 +452,37 @@ counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) - to_remove = false - reason = "" - if counts["img"] > counts["p"] - reason = "too many images" - to_remove = true - elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" - reason = "more <li>s than <p>s" - to_remove = true - elsif counts["input"] > (counts["p"] / 3).to_i - reason = "less than 3x <p>s than <input>s" - to_remove = true - elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2) - reason = "too short a content length without a single image" - to_remove = true - elsif weight < 25 && link_density > 0.2 - reason = "too many links for its weight (#{weight})" - to_remove = true - elsif weight >= 25 && link_density > 0.5 - reason = "too many links for its weight (#{weight})" - to_remove = true - elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 - reason = "<embed>s with too short a content length, or too many <embed>s" - to_remove = true - end - - if to_remove + reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density) + if reason debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") el.remove end end end end + + 
# Picks the reason, if any, for which conditional cleaning should drop an
# element. Rules are evaluated top to bottom and the first match wins.
#
# counts         - Hash of tag name => Integer count for the element; the keys
#                  "img", "p", "li", "input" and "embed" are read.
# content_length - Integer length of the element's text.
# options        - Hash; :min_text_length, when set, replaces
#                  TEXT_LENGTH_THRESHOLD as the minimum text length.
# weight         - Numeric weight of the element.
# link_density   - Numeric link density of the element.
# name           - String tag name of the element (optional). "ul" and "ol"
#                  are exempt from the <li>-vs-<p> rule. When omitted, the
#                  element is treated as a non-list element.
#
# Returns a String describing why the element should be removed, or nil when
# no rule matches.
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  # The tag name has to be handed in explicitly: a bare `name` here is not a
  # local of this method and would raise NameError when this rule is reached.
  list_element = %w[ul ol].include?(name)

  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && !list_element
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif weight < 25 && link_density > 0.2
    "too many links for its weight (#{weight})"
  elsif weight >= 25 && link_density > 0.5
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  end
end