lib/readability.rb in ruby-readability-0.5.2 vs lib/readability.rb in ruby-readability-0.5.3

- old
+ new

@@ -37,17 +37,22 @@ def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! - + @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end def make_html @html = Nokogiri::HTML(@input, nil, @options[:encoding]) + # In case document has no body, such as from empty string or redirect + @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0 + + # Remove html comment tags + @html.xpath('//comment()').each { |i| i.remove } end def images(content=nil, reload=false) begin require 'mini_magick' @@ -172,22 +177,23 @@ append = true end end if append - sibling.name = "div" unless %w[div p].include?(sibling.name.downcase) - output << sibling + sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects + sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase) + output << sibling_dup end end output end def select_best_candidate(candidates) sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } - debug("Top 5 canidates:") + debug("Top 5 candidates:") sorted_candidates[0...5].each do |candidate| debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}") end best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 } @@ -279,11 +285,11 @@ end def remove_unlikely_candidates! @html.css("*").each do |elem| str = "#{elem[:class]}#{elem[:id]}" - if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body' + if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body') debug("Removing unlikely candidate - #{str}") elem.remove end end end @@ -306,11 +312,11 @@ # end end end end - def sanitize(node, candidates, options = {}) + def sanitize(node, candidates, options = {}) node.css("h1, h2, h3, h4, h5, h6").each do |header| header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 end node.css("form, object, iframe, embed").each do |elem| @@ -406,8 +412,7 @@ el.remove end end end end - end end