require 'rubygems'
require 'nokogiri'
require 'cgi'
require 'uri'
# Define Object#try so this also runs outside Rails (ActiveSupport provides #try there).
class Object
def try(method, *arguments)
send(method, *arguments) if respond_to? method
end
end
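# Rough usage sketch (illustrative only; the file names and values below are not
# part of this library, and base_uri/request depend on how the caller fetched the page):
#
#   html = File.read("article.html")
#   doc  = Readability::Document.new(Nokogiri::HTML(html), "www.example.com",
#                                    "/article?id=1", :min_text_length => 25)
#   puts doc.content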
module Readability
class Document
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
attr_accessor :document, :base_uri, :request, :options, :best_candidate
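# document - a parsed Nokogiri HTML document for the page
# base_uri - the page's host, matched against the site-specific rules below
# request  - the request path and query string, used to pull video ids out of URLs
# options  - supports :min_text_length, :retry_length and :debug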
def initialize(document, base_uri, request, options = {})
@document = document
@base_uri = base_uri
@request = request
@options = options
end
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
:okMaybeItsACandidateRe => /and|article|body|column|main/i,
:positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
:negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
:replaceFontsRe => /<(\/?)font[^>]*>/i,
:trimRe => /^\s+|\s+$/,
:normalizeRe => /\s{2,}/,
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
:videoRe => /http:\/\/(www\.)?(youtube|vimeo|ted|player\.vimeo)\.com/i
}
# should we get rid of this?
def make_html
@document.encoding = 'UTF-8'
@best_candidate = nil
end
def has_special_rule?
!!rules[@base_uri]
end
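# Main entry point: returns HTML for the extracted article. Video and
# slide-hosting pages get a dedicated embed fragment, sites with a custom rule
# use it, and everything else goes through the scoring heuristic below. If the
# first pass yields too little text, extraction is retried without pruning
# unlikely candidates.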
def content(remove_unlikely_candidates = true)
debug "Starting the content heuristic"
@document.css("script, style").each {|el| el.remove }
@document.search('//comment()').each {|el| el.remove }
article = youtube if is_youtube? && remove_unlikely_candidates
article = vimeo if is_vimeo? && remove_unlikely_candidates
article = ted if is_ted? && remove_unlikely_candidates
article = slideshare if is_slideshare? && remove_unlikely_candidates
article = google_videos if is_google_videos? && remove_unlikely_candidates
article = apply_custom_rule if has_special_rule?
if article && remove_unlikely_candidates
content = article.to_html.gsub(/[\r\n\f]+/, "\n").gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
if rules[@base_uri].try(:encoding) && rules[@base_uri].encoding == 'ISO-8859-1'
return convert_to_utf8(content)
end
return content
else
remove_unlikely_candidates! if remove_unlikely_candidates
transform_misused_divs_into_paragraphs!
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
best_candidate = select_best_candidate(candidates)
article = get_article(candidates, best_candidate)
cleaned_article = sanitize(article, candidates, options)
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
make_html
content(false)
else
cleaned_article
end
end
end
def is_youtube?
  (@base_uri.to_s =~ /^(www\.)?youtube\.com/)
end
def is_vimeo?
  (@base_uri.to_s =~ /^(www\.)?vimeo\.com/)
end
def is_ted?
  (@base_uri.to_s =~ /^(www\.)?ted\.com\/talks/)
end
def is_slideshare?
  (@base_uri.to_s =~ /^(www\.)?slideshare\.net/)
end
def is_special_case?
  (@base_uri.to_s =~ REGEXES[:videoRe])
end
def is_google_videos?
  (@base_uri.to_s =~ /video\.google\.com/)
end
def google_videos
  uri = URI.parse(@base_uri + @request)
  video_id = CGI::parse(uri.query)['docid'].first
  # Representative Flash player embed for the extracted docid.
  Nokogiri::HTML.fragment <<-HTML
    <embed src="http://video.google.com/googleplayer.swf?docid=#{video_id}"
           width="400" height="326" type="application/x-shockwave-flash"></embed>
  HTML
end
def slideshare
  title = @document.css("h1.h-slideshow-title").inner_html
  movie_value = @document.css("link[name='media_presentation']").first.attributes["href"].value
  # Representative embed: the title plus the Flash player URL the page itself advertises.
  Nokogiri::HTML.fragment <<-HTML
    <h1>#{title}</h1>
    <embed src="#{movie_value}" width="425" height="355"
           type="application/x-shockwave-flash" allowfullscreen="true"></embed>
  HTML
end
def youtube
  debug("I have a Youtube video page")
  if @request =~ /\?v=([_\-a-z0-9]+)&?/i
    # Representative embed markup for the extracted video id.
    Nokogiri::HTML.fragment <<-HTML
      <embed src="http://www.youtube.com/v/#{$1}" width="640" height="385"
             type="application/x-shockwave-flash" allowfullscreen="true"></embed>
    HTML
  else
    nil
  end
end
def vimeo
  debug("I have a Vimeo video page")
  # matches non-channel pages, or pages that used swfobject to print the player
  # (assumes #clip_id carries the clip id in its value attribute or text)
  if (clip = @document.css("#clip_id").first)
    clip_id = clip["value"] || clip.text
    Nokogiri::HTML.fragment(%(<embed src="http://vimeo.com/moogaloop.swf?clip_id=#{clip_id}" width="640" height="360" type="application/x-shockwave-flash"></embed>))
  # matches channel pages
  elsif (player = @document.css(".player")).any?
    html = ""
    player.each do |video|
      if video.to_html =~ /clip_id=([0-9]+)/
        html << %(<embed src="http://vimeo.com/moogaloop.swf?clip_id=#{$1}" width="640" height="360" type="application/x-shockwave-flash"></embed>)
      end
    end
    Nokogiri::HTML.fragment(html)
  else
    nil
  end
end
def ted
  debug("I have a TED video page")
  # TED's .copy_paste field holds ready-made embed markup in its value attribute.
  player = @document.css(".copy_paste").first
  embed = player && player["value"].to_s
  if embed && !embed.strip.empty?
    Nokogiri::HTML.fragment(embed)
  else
    nil
  end
end
def get_article(candidates, best_candidate)
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
output = Nokogiri::XML::Node.new('div', @document)
begin
if best_candidate[:elem].try(:parent)
best_candidate[:elem].parent.try(:children).each do |sibling|
append = false
append = true if sibling == best_candidate[:elem]
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
if sibling.name.downcase == "p"
link_density = get_link_density(sibling)
node_content = sibling.text
node_length = node_content.length
if node_length > 80 && link_density < 0.25
append = true
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
append = true
end
end
if append
sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
output << sibling
end
end
end
end
output
end
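# Pick the highest-scoring candidate, falling back to the <body> element when
# nothing was scored. The result is memoized in @best_candidate.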
def select_best_candidate(candidates)
@best_candidate ||= begin
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
debug("Top 5 candidates:")
sorted_candidates[0...5].each do |candidate|
debug("Candidate #{candidate[:elem].try(:name)}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
end
best_candidate = sorted_candidates.first || { :elem => @document.css("body").first, :content_score => 0 }
#debug("Best candidate #{best_candidate[:elem].andand.name} with score #{best_candidate[:content_score]}")
best_candidate
end
end
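# Fraction of the element's text that sits inside <a> tags (0.0 to 1.0).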
def get_link_density(elem)
link_length = elem.css("a").map { |i| i.text }.join("").length
text_length = elem.text.length
return 0.0 if text_length == 0 # avoid dividing by zero for elements with no text
link_length / text_length.to_f
end
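# Score every <p> and <td> with at least min_text_length characters of text:
# one base point, one per comma, and up to three more per 100 characters. The
# score goes to the paragraph's parent (half of it to the grandparent), and each
# candidate's total is then scaled down by its link density.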
def score_paragraphs(min_text_length)
candidates = {}
@document.css("p,td").each do |elem|
parent_node = elem.parent
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
inner_text = elem.text
# If this paragraph is less than 25 characters, don't even count it.
next if inner_text.length < min_text_length
candidates[parent_node] ||= score_node(parent_node)
candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
content_score = 1
content_score += inner_text.split(',').length
content_score += [(inner_text.length / 100).to_i, 3].min
candidates[parent_node][:content_score] += content_score
candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
end
# Scale the final candidates score based on link density. Good content should have a
# relatively small link density (5% or less) and be mostly unaffected by this operation.
candidates.each do |elem, candidate|
candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
end
candidates
end
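# +/-25 points depending on whether the element's class and id match the
# positive or negative patterns in REGEXES.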
def class_weight(e)
weight = 0
if e[:class] && e[:class] != ""
if e[:class] =~ REGEXES[:negativeRe]
weight -= 25
end
if e[:class] =~ REGEXES[:positiveRe]
weight += 25
end
end
if e[:id] && e[:id] != ""
if e[:id] =~ REGEXES[:negativeRe]
weight -= 25
end
if e[:id] =~ REGEXES[:positiveRe]
weight += 25
end
end
weight
end
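# Treat the string as ISO-8859-1 and re-encode it as UTF-8: each byte value
# becomes the Unicode codepoint of the same number.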
def convert_to_utf8(string)
string.unpack("C*").pack("U*")
end
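# Base score for a candidate element: its class/id weight plus a small bonus
# or penalty for the tag name.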
def score_node(elem)
  content_score = class_weight(elem)
  case elem.name.downcase
  when "div" then content_score += 5
  when "blockquote" then content_score += 3
  when "form" then content_score -= 3
  when "th" then content_score -= 5
  end
  { :content_score => content_score, :elem => elem }
end
def debug(str)
puts "READABILITY : "+ str if options[:debug]
end
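# Drop elements whose class or id looks like page chrome (comments, sidebars,
# footers, ...) unless it also looks like it could hold the main content.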
def remove_unlikely_candidates!
@document.css("*").each do |elem|
str = "#{elem[:class]}#{elem[:id]}"
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
debug("Removing unlikely candidate - #{str}")
elem.remove
end
end
end
def transform_misused_divs_into_paragraphs!
@document.css("*").each do |elem|
if elem.name.downcase == "div"
# transform <div>s that do not contain other block elements into <p>s
if elem.inner_html !~ REGEXES[:divToPElementsRe]
debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
elem.name = "p"
end
end
end
end
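# Clean up the extracted article: drop headers with negative weight or high
# link density, all forms, iframes that are not video embeds, and tables,
# lists and divs that score like boilerplate.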
def sanitize(node, candidates, options = {})
node.css("h1, h2, h3, h4, h5, h6").each do |header|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
end
node.css("form").each do |elem|
elem.remove
end
node.css("iframe").each do |iframe|
unless iframe.attr("src").to_s =~ REGEXES[:videoRe]
iframe.remove
end
end
# remove empty <p> tags
# node.css("p").each do |elem|
# elem.remove if elem.content.strip.empty?
# end
# Conditionally clean <table>s, <ul>s, and <div>s
node.css("table, ul, div").each do |el|
weight = class_weight(el)
content_score = candidates[el] ? candidates[el][:content_score] : 0
name = el.name.downcase
if weight + content_score < 0
el.remove
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
elsif el.text.count(",") < 10
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
counts["li"] -= 100
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)
to_remove = false
reason = ""
if (counts["img"] > counts["p"]) && (counts["p"] > 0)
reason = "too many images #{counts['p']}"
to_remove = true
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
reason = "more