lib/readability.rb in ruby-readability-0.2.3 vs lib/readability.rb in ruby-readability-0.3.0.pre

- old
+ new

@@ -1,31 +1,40 @@ require 'rubygems' require 'nokogiri' +require 'guess_html_encoding' module Readability class Document DEFAULT_OPTIONS = { :retry_length => 250, :min_text_length => 25, :remove_unlikely_candidates => true, :weight_classes => true, - :clean_conditionally => true + :clean_conditionally => true, + :remove_empty_nodes => true }.freeze attr_accessor :options, :html def initialize(input, options = {}) - @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>') @options = DEFAULT_OPTIONS.merge(options) + @input = input + + if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding] + @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding] + @options[:encoding] = @input.encoding.to_s + end + + @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] make_html end def make_html - @html = Nokogiri::HTML(@input, nil, 'UTF-8') + @html = Nokogiri::HTML(@input, nil, @options[:encoding]) end REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, @@ -219,11 +228,11 @@ end else # wrap text nodes in p tags # elem.children.each do |child| # if child.text? -## debug("wrapping text node with a p") +# debug("wrapping text node with a p") # child.swap("<p>#{child.text}</p>") # end # end end end @@ -236,13 +245,15 @@ node.css("form, object, iframe, embed").each do |elem| elem.remove end - # remove empty <p> tags - node.css("p").each do |elem| - elem.remove if elem.content.strip.empty? + if @options[:remove_empty_nodes] + # remove <p> tags that have no text content - this will also remove p tags that contain only images. + node.css("p").each do |elem| + elem.remove if elem.content.strip.empty? + end end # Conditionally clean <table>s, <ul>s, and <div>s clean_conditionally(node, candidates, "table, ul, div") @@ -257,21 +268,19 @@ base_whitelist.each {|tag| whitelist[tag] = true } replace_with_whitespace = Hash.new base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } ([node] + node.css("*")).each do |el| - # If element is in whitelist, delete all its attributes if whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents else if replace_with_whitespace[el.node_name] - # Adding &nbsp; here, because swap removes regular spaaces - el.swap('&nbsp;' << el.text << '&nbsp;') + el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document)) else - el.swap(el.text) + el.swap(Nokogiri::XML::Text.new(el.text, el.document)) end end end