lib/readability.rb in ruby-readability-0.6.2 vs lib/readability.rb in ruby-readability-0.7.0

- old
+ new

@@ -17,11 +17,25 @@ :min_image_height => 80, :ignore_image_format => [], :blacklist => nil, :whitelist => nil }.freeze - + + REGEXES = { + :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, + :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, + :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, + :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, + :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, + :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i, + :replaceFontsRe => /<(\/?)font[^>]*>/i, + :trimRe => /^\s+|\s+$/, + :normalizeRe => /\s{2,}/, + :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/, + :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i + } + attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @options = DEFAULT_OPTIONS.merge(options) @input = input @@ -127,11 +141,36 @@ end end (list_images.empty? and content != @html) ? images(@html, true) : list_images end + + def images_with_fqdn_uris!(source_uri) + images_with_fqdn_uris(@html, source_uri) + end + + def images_with_fqdn_uris(document = @html.dup, source_uri) + uri = URI.parse(source_uri) + host = uri.host + scheme = uri.scheme + port = uri.port # defaults to 80 + base = "#{scheme}://#{host}:#{port}/" + + images = [] + document.css("img").each do |elem| + begin + elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + images << elem['src'].to_s + rescue URI::InvalidURIError => exc + elem.remove + end + end + + images(document,true) + end + def get_image_size(url) w, h = FastImage.size(url) raise "Couldn't get size." if w.nil? || h.nil? {:width => w, :height => h} rescue => e @@ -142,24 +181,10 @@ def image_meets_criteria?(image) return false if options[:ignore_image_format].include?(image[:format].downcase) image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) end - REGEXES = { - :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, - :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, - :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, - :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i, - :replaceFontsRe => /<(\/?)font[^>]*>/i, - :trimRe => /^\s+|\s+$/, - :normalizeRe => /\s{2,}/, - :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/, - :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i - } - def title title = @html.css("title").first title ? title.text : nil end @@ -442,18 +467,21 @@ return unless @clean_conditionally node.css(selector).each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase - + if weight + content_score < 0 el.remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 + # For every img under a noscript tag discount one from the count to avoid double counting + counts["img"] -= el.css("noscript").css("img").length + content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) if reason @@ -463,16 +491,16 @@ end end end def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) - if counts["img"] > counts["p"] + if (counts["img"] > counts["p"]) && (counts["img"] > 1) "too many images" elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" "more <li>s than <p>s" elsif counts["input"] > (counts["p"] / 3).to_i "less than 3x <p>s than <input>s" - elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2) + elsif (content_length < options[:min_text_length]) && (counts["img"] != 1) "too short a content length without a single image" elsif weight < 25 && link_density > 0.2 "too many links for its weight (#{weight})" elsif weight >= 25 && link_density > 0.5 "too many links for its weight (#{weight})"