lib/readability.rb in ruby-readability-0.5.2 vs lib/readability.rb in ruby-readability-0.5.3
- old
+ new
@@ -37,17 +37,22 @@
# Prepares the parsed document and selects the best content candidate.
# Removes every script and style element from @html, prunes unlikely
# candidates only when @remove_unlikely_candidates is truthy, runs
# transform_misused_divs_into_paragraphs!, and then stores the scored
# candidates in @candidates and the chosen one in @best_candidate.
def prepare_candidates
# Strip script and style elements before the document is scored below.
@html.css("script, style").each { |i| i.remove }
remove_unlikely_candidates! if @remove_unlikely_candidates
transform_misused_divs_into_paragraphs!
-
+
# Score with the :min_text_length option, then pick from those scores.
@candidates = score_paragraphs(options[:min_text_length])
@best_candidate = select_best_candidate(@candidates)
end
# Parses @input into a Nokogiri HTML document stored in @html, using the
# encoding from @options[:encoding].
# The lines marked "+" are the additions recorded by this diff: when the
# parsed document contains no body element, @html is replaced by a document
# parsed from '<body />' with the same encoding option, and every comment
# node is then removed from @html.
def make_html
@html = Nokogiri::HTML(@input, nil, @options[:encoding])
+ # In case document has no body, such as from empty string or redirect
+ @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
+
+ # Remove html comment tags
+ @html.xpath('//comment()').each { |i| i.remove }
end
def images(content=nil, reload=false)
begin
require 'mini_magick'
@@ -172,22 +177,23 @@
append = true
end
end
if append
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
- output << sibling
+ sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
+ sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
+ output << sibling_dup
end
end
output
end
def select_best_candidate(candidates)
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
- debug("Top 5 canidates:")
+ debug("Top 5 candidates:")
sorted_candidates[0...5].each do |candidate|
debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
end
best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
@@ -279,11 +285,11 @@
end
# Removes elements from @html whose class/id text marks them as unlikely
# content. An element is removed when the concatenation of its class and id
# attribute values matches REGEXES[:unlikelyCandidatesRe] and does not match
# REGEXES[:okMaybeItsACandidateRe]. The old ("-") condition exempts only the
# body element; the new ("+") condition exempts both html and body.
def remove_unlikely_candidates!
# Visit every element in the document.
@html.css("*").each do |elem|
# Build a single string from the class and id attributes to test against.
str = "#{elem[:class]}#{elem[:id]}"
- if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
+ if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
# Log the matched class/id string before detaching the element.
debug("Removing unlikely candidate - #{str}")
elem.remove
end
end
end
@@ -306,11 +312,11 @@
# end
end
end
end
- def sanitize(node, candidates, options = {})
+ def sanitize(node, candidates, options = {})
node.css("h1, h2, h3, h4, h5, h6").each do |header|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
end
node.css("form, object, iframe, embed").each do |elem|
@@ -406,8 +412,7 @@
el.remove
end
end
end
end
-
end
end