lib/readability.rb in ruby-readability-0.5.0 vs lib/readability.rb in ruby-readability-0.5.1
- old
+ new
@@ -3,19 +3,22 @@
require 'guess_html_encoding'
module Readability
class Document
# Default settings; #initialize merges the caller-supplied options hash over these.
DEFAULT_OPTIONS = {
- :retry_length => 250,
- :min_text_length => 25,
+ :retry_length => 250, # #content compares the stripped article text length against this
+ :min_text_length => 25, # passed to score_paragraphs by #prepare_candidates
:remove_unlikely_candidates => true, # gates remove_unlikely_candidates! in #prepare_candidates
- :weight_classes => true,
- :clean_conditionally => true,
- :remove_empty_nodes => true
+ :weight_classes => true, # copied to @weight_classes in #initialize
+ :clean_conditionally => true, # copied to @clean_conditionally in #initialize
+ :remove_empty_nodes => true,
+ :min_image_width => 130, # #image_meets_criteria? rejects images narrower than this
+ :min_image_height => 80, # #image_meets_criteria? rejects images shorter than this
+ :ignore_image_format => [] # formats (e.g. file extensions) that #image_meets_criteria? always rejects
}.freeze
- attr_accessor :options, :html
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
def initialize(input, options = {})
@options = DEFAULT_OPTIONS.merge(options)
@input = input
@@ -26,17 +29,84 @@
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
+ @best_candidate_has_image = true
make_html
end
+ # Prepares the parsed document for extraction and caches the scoring results.
+ #
+ # Removes every <script> and <style> node from @html, optionally drops
+ # unlikely candidates (when @remove_unlikely_candidates is set), rewrites
+ # misused <div>s as paragraphs, then stores the scored candidates in
+ # @candidates and the selected one in @best_candidate.
+ #
+ # Returns the value assigned to @best_candidate.
+ def prepare_candidates
+   @html.css("script, style").each(&:remove)
+   remove_unlikely_candidates! if @remove_unlikely_candidates
+   transform_misused_divs_into_paragraphs!
+
+   minimum_length = options[:min_text_length]
+   @candidates = score_paragraphs(minimum_length)
+   @best_candidate = select_best_candidate(@candidates)
+ end
+
# Parses @input with Nokogiri::HTML, using the :encoding option, and stores
# the resulting document in @html.
def make_html
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)
end
+ # Collects the URLs of <img> elements that satisfy #image_meets_criteria?.
+ #
+ # On a normal call (reload == false) the search is restricted to the best
+ # candidate element computed by #prepare_candidates; the +content+ argument
+ # is ignored in that case. If no image qualifies there, the method calls
+ # itself once with the whole document (@html) and reload == true.
+ #
+ # content - element to search; only honoured when +reload+ is true.
+ # reload  - when true, search +content+ as given and set
+ #           @best_candidate_has_image to false.
+ #
+ # Returns an Array of image URL Strings (possibly empty).
+ # Raises RuntimeError when the mini_magick gem cannot be loaded.
+ def images(content=nil, reload=false)
+   # Deliberately lazy: mini_magick is only needed by this feature.
+   begin
+     require 'mini_magick'
+   rescue LoadError
+     raise "Please install mini_magick in order to use the #images feature."
+   end
+
+   @best_candidate_has_image = false if reload
+
+   prepare_candidates
+   list_images = []
+   tested_images = []
+   content = @best_candidate[:elem] unless reload
+
+   return list_images if content.nil?
+
+   content.css("img").each do |img|
+     url = img["src"]
+     # An <img> without a usable src has nothing to report; skip it instead of raising on nil.
+     next if url.nil? || url.empty?
+
+     # De-duplicate before any network access so a repeated URL is fetched at most once.
+     if tested_images.include?(url)
+       debug("Image was tested: #{url}")
+       next
+     end
+     tested_images.push(url)
+
+     height = img["height"].to_i
+     width = img["width"].to_i
+     # Drop any query string / fragment so "a.png?v=2" yields "png" rather than "png?v=2".
+     format = File.extname(url.sub(/[?#].*\z/m, "")).delete(".")
+     image = {:width => width, :height => height, :format => format}
+     # Only hit the network when the markup does not declare both dimensions.
+     image = load_image(url) if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
+
+     next unless image
+
+     if image_meets_criteria?(image)
+       list_images << url
+     else
+       debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
+     end
+   end
+
+   # Nothing usable in the best candidate: retry once against the full document.
+   if list_images.empty? && content != @html
+     images(@html, true)
+   else
+     list_images
+   end
+ end
+
+ # Opens the image at +url+ through MiniMagick::Image.open.
+ #
+ # Returns whatever MiniMagick::Image.open returns, or nil when it raises a
+ # StandardError (the error is reported through #debug).
+ def load_image(url)
+   MiniMagick::Image.open(url)
+ rescue => error
+   debug("Image error: #{error}")
+   nil
+ end
+
+ # Decides whether an image is acceptable under the configured options.
+ #
+ # image - object answering [:width], [:height] and [:format] (the Hash built
+ #         in #images, or the value returned by #load_image).
+ #
+ # Returns false when the image's format is listed in
+ # options[:ignore_image_format] (compared case-insensitively), otherwise true
+ # only if both dimensions reach options[:min_image_width] and
+ # options[:min_image_height]. A nil option is treated as "no restriction".
+ def image_meets_criteria?(image)
+   # Normalise both sides so "GIF", :gif and "gif" all match, and tolerate a nil option.
+   ignored_formats = Array(options[:ignore_image_format]).map { |fmt| fmt.to_s.downcase }
+   return false if ignored_formats.include?(image[:format].to_s.downcase)
+
+   image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
+ end
+
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
@@ -55,18 +125,13 @@
end
def content(remove_unlikely_candidates = :default)
@remove_unlikely_candidates = false if remove_unlikely_candidates == false
- @html.css("script, style").each(&:remove)
+ prepare_candidates
+ article = get_article(@candidates, @best_candidate)
- remove_unlikely_candidates! if @remove_unlikely_candidates
- transform_misused_divs_into_paragraphs!
- candidates = score_paragraphs(options[:min_text_length])
- best_candidate = select_best_candidate(candidates)
- article = get_article(candidates, best_candidate)
-
- cleaned_article = sanitize(article, candidates, options)
+ cleaned_article = sanitize(article, @candidates, options)
if article.text.strip.length < options[:retry_length]
if @remove_unlikely_candidates
@remove_unlikely_candidates = false
elsif @weight_classes
@weight_classes = false