lib/readability.rb in ruby-readability-0.6.0 vs lib/readability.rb in ruby-readability-0.6.1
- old
+ new
@@ -13,11 +13,13 @@
:weight_classes => true,
:clean_conditionally => true,
:remove_empty_nodes => true,
:min_image_width => 130,
:min_image_height => 80,
- :ignore_image_format => []
+ :ignore_image_format => [],
+ :blacklist => nil,
+ :whitelist => nil
}.freeze
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
def initialize(input, options = {})
@@ -33,22 +35,46 @@
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
@best_candidate_has_image = true
make_html
+ handle_exclusions!(@options[:whitelist], @options[:blacklist])
end
# Prepares the parsed document for extraction and picks the best candidate.
#
# Removes every node matching "script, style" from @html, calls
# remove_unlikely_candidates! when @remove_unlikely_candidates is set, runs
# transform_misused_divs_into_paragraphs!, then scores the document using
# options[:min_text_length].
#
# Stores the scored set in @candidates and the selected winner in
# @best_candidate; the winner is also the return value.
def prepare_candidates
  @html.css("script, style").each { |node| node.remove }
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  @candidates = score_paragraphs(options[:min_text_length])
  @best_candidate = select_best_candidate(@candidates)
end
- def make_html
# Applies the optional :blacklist / :whitelist CSS selectors to @html.
#
# whitelist - CSS selector (or nil). When given, the <body>'s inner HTML is
#             replaced with the serialised nodes that match it.
# blacklist - CSS selector (or nil). When given, every matching node is
#             removed from the document.
#
# The blacklist is applied before the whitelist. Whenever at least one of
# the two is given, @input is refreshed from the modified document. Does
# nothing when both are nil/false.
def handle_exclusions!(whitelist, blacklist)
  return unless whitelist || blacklist

  # css always hands back a node set (possibly empty), so no nil check is needed.
  @html.css(blacklist).each(&:remove) if blacklist

  if whitelist
    kept_markup = @html.css(whitelist).to_s
    body = @html.at_css('body')
    body.inner_html = kept_markup if body
  end

  @input = @html.to_s
end
+
+ def make_html(whitelist=nil, blacklist=nil)
@html = Nokogiri::HTML(@input, nil, @options[:encoding])
# In case document has no body, such as from empty string or redirect
@html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
# Remove html comment tags
@@ -76,20 +102,20 @@
next unless element["src"]
url = element["src"].value
height = element["height"].nil? ? 0 : element["height"].value.to_i
width = element["width"].nil? ? 0 : element["width"].value.to_i
-
+
if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
- image = get_image_size(url)
+ image = get_image_size(url)
next unless image
else
image = {:width => width, :height => height}
end
-
+
image[:format] = File.extname(url).gsub(".", "")
-
+
if tested_images.include?(url)
debug("Image was tested: #{url}")
next
end
@@ -103,18 +129,16 @@
(list_images.empty? and content != @html) ? images(@html, true) : list_images
end
# Looks up the pixel dimensions of the image at +url+ through FastImage.size.
#
# Returns a {:width => w, :height => h} hash. Returns nil, after reporting
# through #debug, when either dimension comes back nil or when the lookup
# raises a StandardError.
def get_image_size(url)
  width, height = FastImage.size(url)

  # A missing dimension is an expected outcome, so report it and bail out
  # directly instead of raising an exception only to rescue it below.
  if width.nil? || height.nil?
    debug("Image error: Couldn't get size.")
    return nil
  end

  {:width => width, :height => height}
rescue => e
  debug("Image error: #{e}")
  nil
end
def image_meets_criteria?(image)
return false if options[:ignore_image_format].include?(image[:format].downcase)
image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
@@ -146,46 +170,38 @@
# Let's grab this author:
# <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
author_elements = @html.xpath('//meta[@name = "dc.creator"]')
unless author_elements.empty?
author_elements.each do |element|
- if element['content']
- return element['content'].strip
- end
+ return element['content'].strip if element['content']
end
end
# Now let's try to grab this
# <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
# <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
unless author_elements.empty?
author_elements.each do |element|
- if element.text
- return element.text.strip
- end
+ return element.text.strip if element.text
end
end
# Now let's try to grab this
# <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
# TODO: strip out the (rel)?
author_elements = @html.xpath('//a[@rel = "author"]')
unless author_elements.empty?
author_elements.each do |element|
- if element.text
- return element.text.strip
- end
+ return element.text.strip if element.text
end
end
author_elements = @html.xpath('//*[@id = "author"]')
unless author_elements.empty?
author_elements.each do |element|
- if element.text
- return element.text.strip
- end
+ return element.text.strip if element.text
end
end
end
def content(remove_unlikely_candidates = :default)
@@ -228,14 +244,14 @@
if sibling.name.downcase == "p"
link_density = get_link_density(sibling)
node_content = sibling.text
node_length = node_content.length
- if node_length > 80 && link_density < 0.25
- append = true
+ append = if node_length > 80 && link_density < 0.25
+ true
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
- append = true
+ true
end
end
if append
sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
@@ -300,44 +316,32 @@
# Scores an element from its class and id attributes.
#
# For each of e[:class] and e[:id] that is present and non-empty, subtracts
# 25 when the value matches REGEXES[:negativeRe] and adds 25 when it matches
# REGEXES[:positiveRe]; a value matching both nets out to zero.
#
# Returns 0 without inspecting the element when @weight_classes is falsy.
def class_weight(e)
  weight = 0
  return weight unless @weight_classes

  [:class, :id].each do |attribute|
    value = e[attribute]
    next unless value && value != ""

    weight -= 25 if value =~ REGEXES[:negativeRe]
    weight += 25 if value =~ REGEXES[:positiveRe]
  end

  weight
end
# Score adjustment applied per (lower-cased) tag name; tags not listed here
# contribute nothing.
ELEMENT_SCORES = {
  'div'        => 5,
  'blockquote' => 3,
  'form'       => -3,
  'th'         => -5
}.freeze

# Builds the initial score record for +elem+.
#
# The score is class_weight(elem) plus the ELEMENT_SCORES entry for the
# element's lower-cased tag name (0 when the tag has no entry).
#
# Returns { :content_score => Integer, :elem => elem }.
def score_node(elem)
  content_score = class_weight(elem)
  content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
  { :content_score => content_score, :elem => elem }
end
def debug(str)
puts str if options[:debug]
@@ -371,11 +375,11 @@
# end
end
end
end
- def sanitize(node, candidates, options = {})
+ def sanitize(node, candidates, options = {})
node.css("h1, h2, h3, h4, h5, h6").each do |header|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
end
node.css("form, object, iframe, embed").each do |elem|
@@ -448,40 +452,37 @@
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
counts["li"] -= 100
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)
- to_remove = false
- reason = ""
- if counts["img"] > counts["p"]
- reason = "too many images"
- to_remove = true
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
- reason = "more <li>s than <p>s"
- to_remove = true
- elsif counts["input"] > (counts["p"] / 3).to_i
- reason = "less than 3x <p>s than <input>s"
- to_remove = true
- elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
- reason = "too short a content length without a single image"
- to_remove = true
- elsif weight < 25 && link_density > 0.2
- reason = "too many links for its weight (#{weight})"
- to_remove = true
- elsif weight >= 25 && link_density > 0.5
- reason = "too many links for its weight (#{weight})"
- to_remove = true
- elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
- reason = "<embed>s with too short a content length, or too many <embed>s"
- to_remove = true
- end
-
- if to_remove
+ reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density)
+ if reason
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
el.remove
end
end
end
end
+
# Decides whether an element should be conditionally cleaned and says why.
#
# counts         - Hash of tag name => count; the keys "p", "img", "li",
#                  "input" and "embed" are read.
# content_length - length of the element's text.
# options        - Hash; :min_text_length overrides TEXT_LENGTH_THRESHOLD.
# weight         - the element's weight.
# link_density   - the element's link density.
# name           - lower-cased tag name of the element (optional). "ul" and
#                  "ol" are exempt from the "<li>s vs <p>s" rule.
#
# Returns the reason String for the first rule that matches, or nil when no
# rule matches.
#
# The method previously read `name` without receiving it, so reaching the
# <li> rule raised NameError. It is now an optional trailing argument, which
# keeps five-argument callers working.
# NOTE(review): a caller that omits `name` gets no <ul>/<ol> exemption; pass
# the element's tag name as the last argument to restore it.
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  list_element = name == "ul" || name == "ol"

  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && !list_element
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif (weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5)
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  end
end
+
end
end