').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] @clean_conditionally = @options[:clean_conditionally] @best_candidate_has_image = true make_html end def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end def make_html @html = Nokogiri::HTML(@input, nil, @options[:encoding]) # In case document has no body, such as from empty string or redirect @html = Nokogiri::HTML('', nil, @options[:encoding]) if @html.css('body').length == 0 # Remove html comment tags @html.xpath('//comment()').each { |i| i.remove } end def images(content=nil, reload=false) begin require 'mini_magick' rescue LoadError raise "Please install mini_magick in order to use the #images feature." end @best_candidate_has_image = false if reload prepare_candidates list_images = [] tested_images = [] content = @best_candidate[:elem] unless reload return list_images if content.nil? elements = content.css("img").map(&:attributes) elements.each do |element| next unless element["src"] url = element["src"].value height = element["height"].nil? ? 0 : element["height"].value.to_i width = element["width"].nil? ? 0 : element["width"].value.to_i format = File.extname(url).gsub(".", "") image = {:width => width, :height => height, :format => format} image = load_image(url) if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) next unless image if tested_images.include?(url) debug("Image was tested: #{url}") next end tested_images.push(url) if image_meets_criteria?(image) list_images << url else debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") end end (list_images.empty? and content != @html) ? images(@html, true) : list_images end def load_image(url) begin MiniMagick::Image.open(url) rescue => e debug("Image error: #{e}") nil end end def image_meets_criteria?(image) return false if options[:ignore_image_format].include?(image[:format].downcase) image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) end REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, :replaceBrsRe => /(]*>[ \n\r\t]*){2,}/i, :replaceFontsRe => /<(\/?)font[^>]*>/i, :trimRe => /^\s+|\s+$/, :normalizeRe => /\s{2,}/, :killBreaksRe => /((\s| ?)*){1,}/, :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i } def title title = @html.css("title").first title ? title.text : nil end # Look through the @html document looking for the author # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted) # Returns nil if no author is detected def author # Let's grab this author: # author_elements = @html.xpath('//meta[@name = "dc.creator"]') unless author_elements.empty? author_elements.each do |element| if element['content'] return element['content'].strip end end end # Now let's try to grab this # ByAustin Fonacier #

s that do not contain other block elements into

s if elem.inner_html !~ REGEXES[:divToPElementsRe] debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); elem.name = "p" end else # wrap text nodes in p tags # elem.children.each do |child| # if child.text? # debug("wrapping text node with a p") # child.swap("

#{child.text}

") # end # end end end end def sanitize(node, candidates, options = {}) node.css("h1, h2, h3, h4, h5, h6").each do |header| header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 end node.css("form, object, iframe, embed").each do |elem| elem.remove end if @options[:remove_empty_nodes] # remove

tags that have no text content - this will also remove p tags that contain only images. node.css("p").each do |elem| elem.remove if elem.content.strip.empty? end end # Conditionally clean s,
s, and
s clean_conditionally(node, candidates, "table, ul, div") # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] # We'll add whitespace instead of block elements, # so a
b will have a nice space between them base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] # Use a hash for speed (don't want to make a million calls to include?) whitelist = Hash.new base_whitelist.each {|tag| whitelist[tag] = true } replace_with_whitespace = Hash.new base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes if whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents else if replace_with_whitespace[el.node_name] el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document)) else el.swap(Nokogiri::XML::Text.new(el.text, el.document)) end end end # Get rid of duplicate whitespace node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ") end def clean_conditionally(node, candidates, selector) return unless @clean_conditionally node.css(selector).each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase if weight + content_score < 0 el.remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) to_remove = false reason = "" if counts["img"] > counts["p"] reason = "too many images" to_remove = true elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" reason = "more
s than
s" to_remove = true elsif counts["input"] > (counts["p"] / 3).to_i reason = "less than 3x
s than s" to_remove = true elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2) reason = "too short a content length without a single image" to_remove = true elsif weight < 25 && link_density > 0.2 reason = "too many links for its weight (#{weight})" to_remove = true elsif weight >= 25 && link_density > 0.5 reason = "too many links for its weight (#{weight})" to_remove = true elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 reason = "s with too short a content length, or too many s" to_remove = true end if to_remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") el.remove end end end end end end