lib/readability.rb in ruby-readability-0.7.0 vs lib/readability.rb in ruby-readability-0.7.1

- old
+ new

@@ -15,11 +15,13 @@ :remove_empty_nodes => true, :min_image_width => 130, :min_image_height => 80, :ignore_image_format => [], :blacklist => nil, - :whitelist => nil + :whitelist => nil, + :elements_to_score => ["p", "td", "pre"], + :likely_siblings => ["p"] }.freeze REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, @@ -258,17 +260,18 @@ def get_article(candidates, best_candidate) # Now that we have the top candidate, look through its siblings for content that might also be related. # Things like preambles, content split by ads that we removed, etc. sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max + downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) best_candidate[:elem].parent.children.each do |sibling| append = false append = true if sibling == best_candidate[:elem] append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold - if sibling.name.downcase == "p" + if downcased_likely_siblings.include?(sibling.name.downcase) link_density = get_link_density(sibling) node_content = sibling.text node_length = node_content.length append = if node_length > 80 && link_density < 0.25 @@ -308,11 +311,11 @@ link_length / text_length.to_f end def score_paragraphs(min_text_length) candidates = {} - @html.css("p,td").each do |elem| + @html.css(options[:elements_to_score].join(',')).each do |elem| parent_node = elem.parent grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil inner_text = elem.text # If this paragraph is less than 25 characters, don't even count it. @@ -421,10 +424,12 @@ # Conditionally clean <table>s, <ul>s, and <div>s clean_conditionally(node, candidates, "table, ul, div") # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] + all_whitelisted = base_whitelist.include?("*") + # We'll add whitespace instead of block elements, # so a<br>b will have a nice space between them base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] # Use a hash for speed (don't want to make a million calls to include?) @@ -433,10 +438,10 @@ replace_with_whitespace = Hash.new base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes - if whitelist[el.node_name] + if all_whitelisted || whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents else # If element is root, replace the node as a text node