lib/readability.rb in ruby-readability-0.5.0 vs lib/readability.rb in ruby-readability-0.5.1

- old
+ new

@@ -3,19 +3,22 @@ require 'guess_html_encoding' module Readability class Document DEFAULT_OPTIONS = { - :retry_length => 250, - :min_text_length => 25, + :retry_length => 250, + :min_text_length => 25, :remove_unlikely_candidates => true, - :weight_classes => true, - :clean_conditionally => true, - :remove_empty_nodes => true + :weight_classes => true, + :clean_conditionally => true, + :remove_empty_nodes => true, + :min_image_width => 130, + :min_image_height => 80, + :ignore_image_format => [] }.freeze - attr_accessor :options, :html + attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @options = DEFAULT_OPTIONS.merge(options) @input = input @@ -26,17 +29,84 @@ @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] + @best_candidate_has_image = true make_html end + def prepare_candidates + @html.css("script, style").each { |i| i.remove } + remove_unlikely_candidates! if @remove_unlikely_candidates + transform_misused_divs_into_paragraphs! + + @candidates = score_paragraphs(options[:min_text_length]) + @best_candidate = select_best_candidate(@candidates) + end + def make_html @html = Nokogiri::HTML(@input, nil, @options[:encoding]) end + def images(content=nil, reload=false) + begin + require 'mini_magick' + rescue LoadError + raise "Please install mini_magick in order to use the #images feature." + end + + @best_candidate_has_image = false if reload + + prepare_candidates + list_images = [] + tested_images = [] + content = @best_candidate[:elem] unless reload + + return list_images if content.nil? + elements = content.css("img").map(&:attributes) + + elements.each do |element| + url = element["src"].value + height = element["height"].nil? ? 0 : element["height"].value.to_i + width = element["width"].nil? ? 0 : element["width"].value.to_i + format = File.extname(url).gsub(".", "") + image = {:width => width, :height => height, :format => format} + image = load_image(url) if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) + + next unless image + + if tested_images.include?(url) + debug("Image was tested: #{url}") + next + end + + tested_images.push(url) + if image_meets_criteria?(image) + list_images << url + else + debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") + end + end + + (list_images.empty? and content != @html) ? images(@html, true) : list_images + end + + def load_image(url) + begin + MiniMagick::Image.open(url) + rescue => e + debug("Image error: #{e}") + nil + end + end + + def image_meets_criteria?(image) + return false if options[:ignore_image_format].include?(image[:format].downcase) + image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) + end + REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, @@ -55,18 +125,13 @@ end def content(remove_unlikely_candidates = :default) @remove_unlikely_candidates = false if remove_unlikely_candidates == false - @html.css("script, style").each(&:remove) + prepare_candidates + article = get_article(@candidates, @best_candidate) - remove_unlikely_candidates! if @remove_unlikely_candidates - transform_misused_divs_into_paragraphs! - candidates = score_paragraphs(options[:min_text_length]) - best_candidate = select_best_candidate(candidates) - article = get_article(candidates, best_candidate) - - cleaned_article = sanitize(article, candidates, options) + cleaned_article = sanitize(article, @candidates, options) if article.text.strip.length < options[:retry_length] if @remove_unlikely_candidates @remove_unlikely_candidates = false elsif @weight_classes @weight_classes = false