lib/readability.rb in ruby-readability-0.7.1 vs lib/readability.rb in ruby-readability-0.7.2
- old
+ new
@@ -17,13 +17,14 @@
:min_image_height => 80,
:ignore_image_format => [],
:blacklist => nil,
:whitelist => nil,
:elements_to_score => ["p", "td", "pre"],
- :likely_siblings => ["p"]
+ :likely_siblings => ["p"],
+ :ignore_redundant_nesting => false
}.freeze
-
+
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
@@ -33,11 +34,11 @@
:trimRe => /^\s+|\s+$/,
:normalizeRe => /\s{2,}/,
:killBreaksRe => /(<br\s*\/?>(\s| ?)*){1,}/,
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}
-
+
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
def initialize(input, options = {})
@options = DEFAULT_OPTIONS.merge(options)
@input = input
@@ -48,11 +49,11 @@
end
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
- @clean_conditionally = @options[:clean_conditionally]
+ @clean_conditionally = !!@options[:clean_conditionally]
@best_candidate_has_image = true
make_html
handle_exclusions!(@options[:whitelist], @options[:blacklist])
end
@@ -143,15 +144,15 @@
end
end
(list_images.empty? and content != @html) ? images(@html, true) : list_images
end
-
+
# Bang variant of images_with_fqdn_uris: delegates to it with @html itself as
# the document (instead of that method's default `@html.dup`) and the given
# source_uri. Returns whatever images_with_fqdn_uris returns.
# NOTE(review): because @html is passed directly, any src rewriting or element
# removal done by images_with_fqdn_uris presumably happens on @html in place — confirm.
def images_with_fqdn_uris!(source_uri)
images_with_fqdn_uris(@html, source_uri)
end
-
+
def images_with_fqdn_uris(document = @html.dup, source_uri)
uri = URI.parse(source_uri)
host = uri.host
scheme = uri.scheme
port = uri.port # defaults to 80
@@ -159,11 +160,11 @@
base = "#{scheme}://#{host}:#{port}/"
images = []
document.css("img").each do |elem|
begin
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
images << elem['src'].to_s
rescue URI::InvalidURIError => exc
elem.remove
end
end
@@ -262,18 +263,29 @@
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
- best_candidate[:elem].parent.children.each do |sibling|
+
+ # If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
+ # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
+ # related content detection, but could lead to false positives. Not supported in arc90's readability.
+ node =
+ if options[:ignore_redundant_nesting]
+ closest_node_with_siblings(best_candidate[:elem])
+ else
+ best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
+ end
+
+ node.parent.children.each do |sibling|
append = false
- append = true if sibling == best_candidate[:elem]
+ append = true if sibling == node
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
if downcased_likely_siblings.include?(sibling.name.downcase)
link_density = get_link_density(sibling)
- node_content = sibling.text
+ node_content = sibling.text.strip
node_length = node_content.length
append = if node_length > 80 && link_density < 0.25
true
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
@@ -289,10 +301,27 @@
end
output
end
# Walks upward from +element+ and returns the first node (starting with
# +element+ itself) whose parent has more than one child once whitespace-only
# text nodes are ignored. If the walk reaches a node named 'body' first, that
# node is returned instead.
# NOTE(review): the loop only stops at a node named 'body'; if +element+ has no
# such ancestor, node.parent may eventually be nil and `.children` would raise — confirm.
+ def closest_node_with_siblings(element)
+ node = element
+
+ until node.node_name == 'body'
+ siblings = node.parent.children
# Blank text nodes (e.g. indentation between tags) do not count as siblings.
+ non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
+
+ if non_empty.size > 1
+ return node
+ else
# This node is the only meaningful child of its parent, so climb one level.
+ node = node.parent
+ end
+ end
+
+ node
+ end
+
def select_best_candidate(candidates)
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
debug("Top 5 candidates:")
sorted_candidates[0...5].each do |candidate|
@@ -370,11 +399,15 @@
content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
{ :content_score => content_score, :elem => elem }
end
# Emits the debug message +str+ according to options[:debug].
# Old body (the "-" line): print +str+ with puts when options[:debug] is truthy.
# New body (the "+" lines): if options[:debug] responds to :call, pass +str+ to
# it; otherwise, if options[:debug] is truthy, print +str+ with puts. A falsy
# option emits nothing in either version.
def debug(str)
- puts str if options[:debug]
+ if options[:debug].respond_to?(:call)
+ options[:debug].call(str)
+ elsif options[:debug]
+ puts str
+ end
end
def remove_unlikely_candidates!
@html.css("*").each do |elem|
str = "#{elem[:class]}#{elem[:id]}"
@@ -424,11 +457,12 @@
# Conditionally clean <table>s, <ul>s, and <div>s
clean_conditionally(node, candidates, "table, ul, div")
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
- all_whitelisted = base_whitelist.include?("*")
+ all_tags_whitelisted = base_whitelist.include?("*")
+ all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
@@ -438,12 +472,12 @@
replace_with_whitespace = Hash.new
base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
- if all_whitelisted || whitelist[el.node_name]
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+ if all_tags_whitelisted || whitelist[el.node_name]
+ el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
# Otherwise, replace the element with its contents
else
# If element is root, replace the node as a text node
if el.parent.nil?
@@ -468,32 +502,45 @@
return html.gsub(/[\r\n\f]+/, "\n" )
end
# Removes elements under +node+ that match the CSS +selector+ and look like
# non-content. Does nothing unless @clean_conditionally is truthy.
#
# For each matching element:
# * weight comes from class_weight(el); content_score comes from
#   candidates[el][:content_score], or 0 when the element is not a candidate.
# * The element is marked for removal when weight + content_score < 0, or when
#   its text has fewer than 10 commas and clean_conditionally_reason? returns
#   a reason.
#
# Old body ("-" lines): the element was removed and logged inside each branch.
# New body ("+" lines): the decision is collected in +remove+ / +message+; if
# options[:clean_conditionally] responds to :call it is invoked with a context
# hash (keys :remove, :message, :weight, :content_score, :el) and its return
# value replaces the decision. The element is then logged via debug and removed
# when the final decision is truthy.
def clean_conditionally(node, candidates, selector)
return unless @clean_conditionally
+
node.css(selector).each do |el|
weight = class_weight(el)
content_score = candidates[el] ? candidates[el][:content_score] : 0
name = el.name.downcase
-
+ remove = false
+ message = nil
+
if weight + content_score < 0
- el.remove
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+ remove = true
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
elsif el.text.count(",") < 10
# Count descendant elements of each kind, keyed by tag name.
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
# NOTE(review): the li count is offset by -100; the rationale is not visible here — confirm.
counts["li"] -= 100
# For every img under a noscript tag discount one from the count to avoid double counting
counts["img"] -= el.css("noscript").css("img").length
-
+
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)
reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
if reason
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
- el.remove
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
+ remove = true
end
+ end
+
# A callable option gets the final say; it runs for every matched element,
# including those the heuristics above did not mark for removal.
+ if options[:clean_conditionally].respond_to?(:call)
+ context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
+ remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
+ end
+
+ if remove
# message is nil when only the user callback asked for removal, hence the fallback text.
+ debug(message || "Conditionally cleaned by user-specified function.")
+ el.remove
end
end
end
def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)