lib/readability.rb in ruby-readability-0.7.1 vs lib/readability.rb in ruby-readability-0.7.2

- old
+ new

@@ -17,13 +17,14 @@ :min_image_height => 80, :ignore_image_format => [], :blacklist => nil, :whitelist => nil, :elements_to_score => ["p", "td", "pre"], - :likely_siblings => ["p"] + :likely_siblings => ["p"], + :ignore_redundant_nesting => false }.freeze - + REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, @@ -33,11 +34,11 @@ :trimRe => /^\s+|\s+$/, :normalizeRe => /\s{2,}/, :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/, :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i } - + attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @options = DEFAULT_OPTIONS.merge(options) @input = input @@ -48,11 +49,11 @@ end @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] - @clean_conditionally = @options[:clean_conditionally] + @clean_conditionally = !!@options[:clean_conditionally] @best_candidate_has_image = true make_html handle_exclusions!(@options[:whitelist], @options[:blacklist]) end @@ -143,15 +144,15 @@ end end (list_images.empty? and content != @html) ? images(@html, true) : list_images end - + def images_with_fqdn_uris!(source_uri) images_with_fqdn_uris(@html, source_uri) end - + def images_with_fqdn_uris(document = @html.dup, source_uri) uri = URI.parse(source_uri) host = uri.host scheme = uri.scheme port = uri.port # defaults to 80 @@ -159,11 +160,11 @@ base = "#{scheme}://#{host}:#{port}/" images = [] document.css("img").each do |elem| begin - elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil images << elem['src'].to_s rescue URI::InvalidURIError => exc elem.remove end end @@ -262,18 +263,29 @@ # Things like preambles, content split by ads that we removed, etc. sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) - best_candidate[:elem].parent.children.each do |sibling| + + # If the best candidate is the only element in its parent then we will never find any siblings. Therefore, + # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the + # related content detection, but could lead to false positives. Not supported in arc90's readability. + node = + if options[:ignore_redundant_nesting] + closest_node_with_siblings(best_candidate[:elem]) + else + best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability. + end + + node.parent.children.each do |sibling| append = false - append = true if sibling == best_candidate[:elem] + append = true if sibling == node append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold if downcased_likely_siblings.include?(sibling.name.downcase) link_density = get_link_density(sibling) - node_content = sibling.text + node_content = sibling.text.strip node_length = node_content.length append = if node_length > 80 && link_density < 0.25 true elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ @@ -289,10 +301,27 @@ end output end + def closest_node_with_siblings(element) + node = element + + until node.node_name == 'body' + siblings = node.parent.children + non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? } + + if non_empty.size > 1 + return node + else + node = node.parent + end + end + + node + end + def select_best_candidate(candidates) sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } debug("Top 5 candidates:") sorted_candidates[0...5].each do |candidate| @@ -370,11 +399,15 @@ content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0) { :content_score => content_score, :elem => elem } end def debug(str) - puts str if options[:debug] + if options[:debug].respond_to?(:call) + options[:debug].call(str) + elsif options[:debug] + puts str + end end def remove_unlikely_candidates! @html.css("*").each do |elem| str = "#{elem[:class]}#{elem[:id]}" @@ -424,11 +457,12 @@ # Conditionally clean <table>s, <ul>s, and <div>s clean_conditionally(node, candidates, "table, ul, div") # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] - all_whitelisted = base_whitelist.include?("*") + all_tags_whitelisted = base_whitelist.include?("*") + all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*") # We'll add whitespace instead of block elements, # so a<br>b will have a nice space between them base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] @@ -438,12 +472,12 @@ replace_with_whitespace = Hash.new base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes - if all_whitelisted || whitelist[el.node_name] - el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } + if all_tags_whitelisted || whitelist[el.node_name] + el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted # Otherwise, replace the element with its contents else # If element is root, replace the node as a text node if el.parent.nil? @@ -468,32 +502,45 @@ return html.gsub(/[\r\n\f]+/, "\n" ) end def clean_conditionally(node, candidates, selector) return unless @clean_conditionally + node.css(selector).each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase - + remove = false + message = nil + if weight + content_score < 0 - el.remove - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") + remove = true + message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero." elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 # For every img under a noscript tag discount one from the count to avoid double counting counts["img"] -= el.css("noscript").css("img").length - + content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) if reason - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") - el.remove + message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}." + remove = true end + end + + if options[:clean_conditionally].respond_to?(:call) + context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el } + remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element. + end + + if remove + debug(message || "Conditionally cleaned by user-specified function.") + el.remove end end end def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)