lib/sanitize.rb in sanitize-1.1.1.dev.20091102 vs lib/sanitize.rb in sanitize-1.2.0.dev.20091104

- old
+ new

@@ -27,25 +27,57 @@ require 'sanitize/config/restricted' require 'sanitize/config/basic' require 'sanitize/config/relaxed' class Sanitize + attr_reader :config # Matches an attribute value that could be treated by a browser as a URL # with a protocol prefix, such as "http:" or "javascript:". Any string of zero # or more characters followed by a colon is considered a match, even if the # colon is encoded as an entity and even if it's an incomplete entity (which # IE6 and Opera will still parse). REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i #-- + # Class Methods + #++ + + # Returns a sanitized copy of _html_, using the settings in _config_ if + # specified. + def self.clean(html, config = {}) + sanitize = Sanitize.new(config) + sanitize.clean(html) + end + + # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes + # were made. + def self.clean!(html, config = {}) + sanitize = Sanitize.new(config) + sanitize.clean!(html) + end + + # Sanitizes the specified Nokogiri::XML::Node and all its children. + def self.clean_node!(node, config = {}) + sanitize = Sanitize.new(config) + sanitize.clean_node!(node) + end + + #-- # Instance Methods #++ # Returns a new Sanitize object initialized with the settings in _config_. def initialize(config = {}) + # Sanitize configuration. @config = Config::DEFAULT.merge(config) + @config[:transformers] = Array(@config[:transformers]) + + # Specific nodes to whitelist (along with all their attributes). This array + # is generated at runtime by transformers, and is cleared before and after + # a fragment is cleaned (so it applies only to a specific fragment). + @whitelist_nodes = [] end # Returns a sanitized copy of _html_. def clean(html) dupe = html.dup @@ -53,103 +85,144 @@ end # Performs clean in place, returning _html_, or +nil+ if no changes were # made. def clean!(html) + @whitelist_nodes = [] fragment = Nokogiri::HTML::DocumentFragment.parse(html) + clean_node!(fragment) + @whitelist_nodes = [] - fragment.traverse do |node| - if node.comment? - node.unlink unless @config[:allow_comments] - elsif node.element? - name = node.name.to_s.downcase - - # Delete any element that isn't in the whitelist. - unless @config[:elements].include?(name) - node.children.each { |n| node.add_previous_sibling(n) } - node.unlink - next - end - - attr_whitelist = ((@config[:attributes][name] || []) + - (@config[:attributes][:all] || [])).uniq - - if attr_whitelist.empty? - # Delete all attributes from elements with no whitelisted - # attributes. - node.attribute_nodes.each { |attr| attr.remove } - else - # Delete any attribute that isn't in the whitelist for this element. - node.attribute_nodes.each do |attr| - attr.unlink unless attr_whitelist.include?(attr.name.downcase) - end - - # Delete remaining attributes that use unacceptable protocols. - if @config[:protocols].has_key?(name) - protocol = @config[:protocols][name] - - node.attribute_nodes.each do |attr| - attr_name = attr.name.downcase - next false unless protocol.has_key?(attr_name) - - del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL - !protocol[attr_name].include?($1.downcase) - else - !protocol[attr_name].include?(:relative) - end - - attr.unlink if del - end - end - end - - # Add required attributes. - if @config[:add_attributes].has_key?(name) - @config[:add_attributes][name].each do |key, val| - node[key] = val - end - end - elsif node.cdata? - node.replace(Nokogiri::XML::Text.new(node.text, node.document)) - end - end - - # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII - # string no matter what we ask for. This will be fixed in 1.4.0, but for - # now we have to hack around it to prevent errors. output_method_params = {:encoding => 'utf-8', :indent => 0} + if @config[:output] == :xhtml output_method = fragment.method(:to_xhtml) - output_method_params.merge!(:save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML) + output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML elsif @config[:output] == :html output_method = fragment.method(:to_html) else raise Error, "unsupported output format: #{@config[:output]}" end result = output_method.call(output_method_params) + + # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII + # string no matter what we ask for. This will be fixed in 1.4.0, but for + # now we have to hack around it to prevent errors. result.force_encoding('utf-8') if RUBY_VERSION >= '1.9' return result == html ? nil : html[0, html.length] = result end - #-- - # Class Methods - #++ + # Sanitizes the specified Nokogiri::XML::Node and all its children. + def clean_node!(node) + raise ArgumentError unless node.is_a?(Nokogiri::XML::Node) - class << self - # Returns a sanitized copy of _html_, using the settings in _config_ if - # specified. - def clean(html, config = {}) - sanitize = Sanitize.new(config) - sanitize.clean(html) + node.traverse do |traversed_node| + if traversed_node.element? + clean_element!(traversed_node) + elsif traversed_node.comment? + traversed_node.unlink unless @config[:allow_comments] + elsif traversed_node.cdata? + traversed_node.replace(Nokogiri::XML::Text.new(traversed_node.text, + traversed_node.document)) + end end - # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes - # were made. - def clean!(html, config = {}) - sanitize = Sanitize.new(config) - sanitize.clean!(html) + node + end + + private + + def clean_element!(node) + # Run this node through all configured transformers. + transform = transform_element!(node) + + # If this node is in the dynamic whitelist array (built at runtime by + # transformers), let it live with all of its attributes intact. + return if @whitelist_nodes.include?(node) + + name = node.name.to_s.downcase + + # Delete any element that isn't in the whitelist. + unless transform[:whitelist] || @config[:elements].include?(name) + node.children.each { |n| node.add_previous_sibling(n) } + node.unlink + return end + + attr_whitelist = (transform[:attr_whitelist] + + (@config[:attributes][name] || []) + + (@config[:attributes][:all] || [])).uniq + + if attr_whitelist.empty? + # Delete all attributes from elements with no whitelisted attributes. + node.attribute_nodes.each {|attr| attr.remove } + else + # Delete any attribute that isn't in the whitelist for this element. + node.attribute_nodes.each do |attr| + attr.unlink unless attr_whitelist.include?(attr.name.downcase) + end + + # Delete remaining attributes that use unacceptable protocols. + if @config[:protocols].has_key?(name) + protocol = @config[:protocols][name] + + node.attribute_nodes.each do |attr| + attr_name = attr.name.downcase + next false unless protocol.has_key?(attr_name) + + del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL + !protocol[attr_name].include?($1.downcase) + else + !protocol[attr_name].include?(:relative) + end + + attr.unlink if del + end + end + end + + # Add required attributes. + if @config[:add_attributes].has_key?(name) + @config[:add_attributes][name].each do |key, val| + node[key] = val + end + end + + transform end + def transform_element!(node) + output = { + :attr_whitelist => [], + :node => node, + :whitelist => false + } + + @config[:transformers].inject(node) do |transformer_node, transformer| + transform = transformer.call({ + :config => @config, + :node => transformer_node + }) + + if transform.nil? + transformer_node + elsif transform.is_a?(Hash) + if transform[:whitelist_nodes].is_a?(Array) + @whitelist_nodes += transform[:whitelist_nodes] + @whitelist_nodes.uniq! + end + + output[:attr_whitelist] += transform[:attr_whitelist] if transform[:attr_whitelist].is_a?(Array) + output[:whitelist] ||= true if transform[:whitelist] + output[:node] = transform[:node].is_a?(Nokogiri::XML::Node) ? transform[:node] : output[:node] + else + raise Error, "transformer output must be a Hash or nil" + end + end + + node.replace(output[:node]) if node != output[:node] + + return output + end end