require 'rubygems' require 'nokogiri' require 'guess_html_encoding' module Readability class Document DEFAULT_OPTIONS = { :retry_length => 250, :min_text_length => 25, :remove_unlikely_candidates => true, :weight_classes => true, :clean_conditionally => true, :remove_empty_nodes => true, :min_image_width => 130, :min_image_height => 80, :ignore_image_format => [] }.freeze attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @options = DEFAULT_OPTIONS.merge(options) @input = input if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding] @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding] @options[:encoding] = @input.encoding.to_s end @input = @input.gsub(REGEXES[:replaceBrsRe], '
').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] @best_candidate_has_image = true make_html end def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end def make_html @html = Nokogiri::HTML(@input, nil, @options[:encoding]) # In case document has no body, such as from empty string or redirect @html = Nokogiri::HTML('