# encoding: utf-8
require 'rubygems'
require 'nokogiri'
require 'guess_html_encoding'
module DQReadability
class Document
# Baseline settings. Document#initialize merges caller-supplied options over
# these, so any key given by the caller wins.
#
# Both the hash and the nested :ignore_image_format array are frozen: merge is
# shallow, so an unfrozen array would be shared by every Document and an
# in-place append on one instance would leak into all later ones.
DEFAULT_OPTIONS = {
  :retry_length => 250,
  # Passed to score_paragraphs as the minimum text length.
  :min_text_length => 25,
  # The next three flags are copied into instance variables by #initialize.
  :remove_unlikely_candidates => true,
  :weight_classes => true,
  :clean_conditionally => true,
  :remove_empty_nodes => true,
  # NOTE(review): presumably pixel thresholds for image selection - confirm
  # against the image-handling code.
  :min_image_width => 130,
  :min_image_height => 80,
  :ignore_image_format => [].freeze
}.freeze
# Patterns used while cleaning and scoring the document.
#
# The two <br> patterns had lost their "<br" prefix (and killBreaksRe its
# "&nbsp;" entity), leaving multi-line literals that matched newline debris
# instead of break tags; they are restored here. videoRe also accepts https
# URLs now. The table is frozen so it cannot be altered at runtime.
REGEXES = {
  # class/id fragments that mark a node as an unlikely content candidate
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  # fragments that rescue a node otherwise matched by unlikelyCandidatesRe
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # opening tags of block-level children
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # two or more <br> tags in a row, optionally separated by whitespace
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  # opening or closing <font> tag; group 1 captures the optional slash
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  # one or more <br> tags, each followed by any whitespace or &nbsp; entities
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  # YouTube / Vimeo URLs over http or https
  :videoRe => %r{https?://(www\.)?(youtube|vimeo)\.com}i
}.freeze
# Merged option hash and the parsed document assigned by make_html.
attr_accessor :options, :html
# Scoring results assigned by prepare_candidates.
attr_accessor :candidates, :best_candidate
# Initialised to true in #initialize.
attr_accessor :best_candidate_has_image
def initialize(input, options = {})
@options = DEFAULT_OPTIONS.merge(options)
@input = input
if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
@input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
@options[:encoding] = @input.encoding.to_s
end
@input = @input.gsub(REGEXES[:replaceBrsRe], '
').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] @best_candidate_has_image = true make_html end def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end def make_html @html = Nokogiri::HTML(@input, nil, @options[:encoding]) # In case document has no body, such as from empty string or redirect @html = Nokogiri::HTML('