lib/readability.rb in ruby-readability-0.7.0 vs lib/readability.rb in ruby-readability-0.7.1
- old
+ new
@@ -15,11 +15,13 @@
:remove_empty_nodes => true,
:min_image_width => 130,
:min_image_height => 80,
:ignore_image_format => [],
:blacklist => nil,
- :whitelist => nil
+ :whitelist => nil,
+ :elements_to_score => ["p", "td", "pre"],
+ :likely_siblings => ["p"]
}.freeze
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -258,17 +260,18 @@
def get_article(candidates, best_candidate)
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+ downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
best_candidate[:elem].parent.children.each do |sibling|
append = false
append = true if sibling == best_candidate[:elem]
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
- if sibling.name.downcase == "p"
+ if downcased_likely_siblings.include?(sibling.name.downcase)
link_density = get_link_density(sibling)
node_content = sibling.text
node_length = node_content.length
append = if node_length > 80 && link_density < 0.25
@@ -308,11 +311,11 @@
link_length / text_length.to_f
end
def score_paragraphs(min_text_length)
candidates = {}
- @html.css("p,td").each do |elem|
+ @html.css(options[:elements_to_score].join(',')).each do |elem|
parent_node = elem.parent
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
inner_text = elem.text
# If this paragraph is less than 25 characters, don't even count it.
@@ -421,10 +424,12 @@
# Conditionally clean <table>s, <ul>s, and <div>s
clean_conditionally(node, candidates, "table, ul, div")
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
+ all_whitelisted = base_whitelist.include?("*")
+
# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
# Use a hash for speed (don't want to make a million calls to include?)
@@ -433,10 +438,10 @@
replace_with_whitespace = Hash.new
base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
- if whitelist[el.node_name]
+ if all_whitelisted || whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
# Otherwise, replace the element with its contents
else
# If element is root, replace the node as a text node