lib/readability.rb in ruby-readability-0.2.3 vs lib/readability.rb in ruby-readability-0.3.0.pre
- old
+ new
@@ -1,31 +1,40 @@
require 'rubygems'
require 'nokogiri'
+require 'guess_html_encoding'
module Readability
class Document
DEFAULT_OPTIONS = {
:retry_length => 250,
:min_text_length => 25,
:remove_unlikely_candidates => true,
:weight_classes => true,
- :clean_conditionally => true
+ :clean_conditionally => true,
+ :remove_empty_nodes => true
}.freeze
attr_accessor :options, :html
def initialize(input, options = {})
- @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@options = DEFAULT_OPTIONS.merge(options)
+ @input = input
+
+ if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding]
+ @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+ @options[:encoding] = @input.encoding.to_s
+ end
+
+ @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
make_html
end
def make_html
- @html = Nokogiri::HTML(@input, nil, 'UTF-8')
+ @html = Nokogiri::HTML(@input, nil, @options[:encoding])
end
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -219,11 +228,11 @@
end
else
# wrap text nodes in p tags
# elem.children.each do |child|
# if child.text?
-## debug("wrapping text node with a p")
+# debug("wrapping text node with a p")
# child.swap("<p>#{child.text}</p>")
# end
# end
end
end
@@ -236,13 +245,15 @@
node.css("form, object, iframe, embed").each do |elem|
elem.remove
end
- # remove empty <p> tags
- node.css("p").each do |elem|
- elem.remove if elem.content.strip.empty?
+ if @options[:remove_empty_nodes]
+ # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+ node.css("p").each do |elem|
+ elem.remove if elem.content.strip.empty?
+ end
end
# Conditionally clean <table>s, <ul>s, and <div>s
clean_conditionally(node, candidates, "table, ul, div")
@@ -257,21 +268,19 @@
base_whitelist.each {|tag| whitelist[tag] = true }
replace_with_whitespace = Hash.new
base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
([node] + node.css("*")).each do |el|
-
# If element is in whitelist, delete all its attributes
if whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
# Otherwise, replace the element with its contents
else
if replace_with_whitespace[el.node_name]
- # Adding here, because swap removes regular spaaces
- el.swap(' ' << el.text << ' ')
+ el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
else
- el.swap(el.text)
+ el.swap(Nokogiri::XML::Text.new(el.text, el.document))
end
end
end