# <div>s that do not contain other block elements into

s if elem.inner_html !~ REGEXES[:divToPElementsRe] debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); elem.name = "p" end end end end def sanitize(node, candidates, options = {}) node.css("h1, h2, h3, h4, h5, h6").each do |header| header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 end node.css("form").each do |elem| elem.remove end node.css("iframe").each do |iframe| unless iframe.attr("src").to_s =~ REGEXES[:videoRe] iframe.remove end end # remove empty

tags # node.css("p").each do |elem| # elem.remove if elem.content.strip.empty? # end # Conditionally clean s,
s, and
s node.css("table, ul, div").each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase if weight + content_score < 0 el.remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) to_remove = false reason = "" if (counts["img"] > counts["p"]) && (counts["p"] > 0) reason = "too many images #{counts['p']}" to_remove = true elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" reason = "more
s than
s" to_remove = true elsif counts["input"] > (counts["p"] / 3).to_i reason = "less than 3x
s than s" to_remove = true elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2) reason = "too short a content length without a single image" to_remove = true elsif weight < 25 && link_density > 0.2 reason = "too many links for its weight (#{weight})" to_remove = true elsif weight >= 25 && link_density > 0.5 reason = "too many links for its weight (#{weight})" to_remove = true elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 reason = "s with too short a content length, or too many s" to_remove = true end if to_remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") el.remove end end end # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] # Use a hash for speed (don't want to make a million calls to include?) whitelist = Hash.new base_whitelist.each {|tag| whitelist[tag] = true } ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes if whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents else # keep getting whiny nils with nokogiri el.swap(el.text) rescue nil end end # Get rid of duplicate whitespace node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ") end private def rules if @base_uri =~ /^www\.(.*)$/ @base_uri = $1 end @rules ||= YAML.load_file(options[:exceptions_file] || File.dirname(__FILE__) + "/../special_rules.yml")["sites"] end def apply_custom_rule debug "Applying custom selector for : " + rules[@base_uri]['name'] extracted = @document.css(rules[@base_uri]["css"]) extracted.each do |elem| if (elem.try(:inner_html) =~ /^\W*$/) extracted.delete elem end end extracted end end private def remove_empty_tags(chunk) chunk.css("p").each do 
|elem| elem.remove if elem.content.strip.empty? end end end