sanitize.rb in sanitize-1.2.0.dev.20091104

- old
+ new

@@ -27,25 +27,57 @@
 require 'sanitize/config/restricted'
 require 'sanitize/config/basic'
 require 'sanitize/config/relaxed'
 
 class Sanitize
+  attr_reader :config
 
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
 
   #--
+  # Class Methods
+  #++
+
+  # Returns a sanitized copy of _html_, using the settings in _config_ if
+  # specified.
+  def self.clean(html, config = {})
+    sanitize = Sanitize.new(config)
+    sanitize.clean(html)
+  end
+
+  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
+  # were made.
+  def self.clean!(html, config = {})
+    sanitize = Sanitize.new(config)
+    sanitize.clean!(html)
+  end
+
+  # Sanitizes the specified Nokogiri::XML::Node and all its children.
+  def self.clean_node!(node, config = {})
+    sanitize = Sanitize.new(config)
+    sanitize.clean_node!(node)
+  end
+
+  #--
   # Instance Methods
   #++
 
   # Returns a new Sanitize object initialized with the settings in _config_.
   def initialize(config = {})
+    # Sanitize configuration.
     @config = Config::DEFAULT.merge(config)
+    @config[:transformers] = Array(@config[:transformers])
+
+    # Specific nodes to whitelist (along with all their attributes). This array
+    # is generated at runtime by transformers, and is cleared before and after
+    # a fragment is cleaned (so it applies only to a specific fragment).
+    @whitelist_nodes = []
   end
 
   # Returns a sanitized copy of _html_.
   def clean(html)
     dupe = html.dup
@@ -53,103 +85,144 @@
   end
 
   # Performs clean in place, returning _html_, or +nil+ if no changes were
   # made.
   def clean!(html)
+    # Start from an empty dynamic whitelist so entries collected for a
+    # previous fragment cannot affect this one.
+    @whitelist_nodes = []
     fragment = Nokogiri::HTML::DocumentFragment.parse(html)
+    # Sanitize the parsed fragment in place, then drop the per-fragment
+    # whitelist again.
+    clean_node!(fragment)
+    @whitelist_nodes = []
 
-    fragment.traverse do |node|
-      if node.comment?
-        node.unlink unless @config[:allow_comments]
-      elsif node.element?
-        name = node.name.to_s.downcase
-
-        # Delete any element that isn't in the whitelist.
-        unless @config[:elements].include?(name)
-          node.children.each { |n| node.add_previous_sibling(n) }
-          node.unlink
-          next
-        end
-
-        attr_whitelist = ((@config[:attributes][name] || []) +
-            (@config[:attributes][:all] || [])).uniq
-
-        if attr_whitelist.empty?
-          # Delete all attributes from elements with no whitelisted
-          # attributes.
-          node.attribute_nodes.each { |attr| attr.remove }
-        else
-          # Delete any attribute that isn't in the whitelist for this element.
-          node.attribute_nodes.each do |attr|
-            attr.unlink unless attr_whitelist.include?(attr.name.downcase)
-          end
-
-          # Delete remaining attributes that use unacceptable protocols.
-          if @config[:protocols].has_key?(name)
-            protocol = @config[:protocols][name]
-
-            node.attribute_nodes.each do |attr|
-              attr_name = attr.name.downcase
-              next false unless protocol.has_key?(attr_name)
-
-              del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
-                !protocol[attr_name].include?($1.downcase)
-              else
-                !protocol[attr_name].include?(:relative)
-              end
-
-              attr.unlink if del
-            end
-          end
-        end
-
-        # Add required attributes.
-        if @config[:add_attributes].has_key?(name)
-          @config[:add_attributes][name].each do |key, val|
-            node[key] = val
-          end
-        end
-      elsif node.cdata?
-        node.replace(Nokogiri::XML::Text.new(node.text, node.document))
-      end
-    end
-
-    # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
-    # string no matter what we ask for. This will be fixed in 1.4.0, but for
-    # now we have to hack around it to prevent errors.
+    # Options handed to whichever serializer method is selected below.
     output_method_params = {:encoding => 'utf-8', :indent => 0}
+
     if @config[:output] == :xhtml
       output_method = fragment.method(:to_xhtml)
-      output_method_params.merge!(:save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML)
+      output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
     elsif @config[:output] == :html
       output_method = fragment.method(:to_html)
     else
       raise Error, "unsupported output format: #{@config[:output]}"
     end
 
     result = output_method.call(output_method_params)
+
+    # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
+    # string no matter what we ask for. This will be fixed in 1.4.0, but for
+    # now we have to hack around it to prevent errors.
     result.force_encoding('utf-8') if RUBY_VERSION >= '1.9'
 
+    # Unchanged input yields nil. Otherwise the caller's string is overwritten
+    # in place with the serialized output, and that output is the return value.
     return result == html ? nil : html[0, html.length] = result
   end
 
-  #--
-  # Class Methods
-  #++
+  # Sanitizes the specified Nokogiri::XML::Node and all its children.
+  def clean_node!(node)
+    # Reject anything that is not a Nokogiri node before touching it.
+    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
 
-  class << self
-    # Returns a sanitized copy of _html_, using the settings in _config_ if
-    # specified.
-    def clean(html, config = {})
-      sanitize = Sanitize.new(config)
-      sanitize.clean(html)
+    # Walk the nodes yielded by Nokogiri's traverse and dispatch on node type:
+    # elements go through clean_element!, comments are dropped unless
+    # :allow_comments is set, and CDATA sections become plain text nodes.
+    node.traverse do |traversed_node|
+      if traversed_node.element?
+        clean_element!(traversed_node)
+      elsif traversed_node.comment?
+        traversed_node.unlink unless @config[:allow_comments]
+      elsif traversed_node.cdata?
+        traversed_node.replace(Nokogiri::XML::Text.new(traversed_node.text,
+            traversed_node.document))
+      end
     end
 
-    # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
-    # were made.
-    def clean!(html, config = {})
-      sanitize = Sanitize.new(config)
-      sanitize.clean!(html)
+    # Hand back the node that was passed in.
+    node
+  end
+
+  private
+
+  # Sanitizes a single element node in place: runs the configured
+  # transformers, then applies the element, attribute and protocol whitelists
+  # from @config and sets any attributes listed under :add_attributes.
+  #
+  # Returns the Hash produced by transform_element!, or +nil+ when the element
+  # was left alone (dynamically whitelisted) or removed.
+  def clean_element!(node)
+    # Run this node through all configured transformers.
+    # NOTE(review): transform_element! may replace _node_ in the document with
+    # transform[:node]; everything below still operates on the original _node_.
+    # Confirm that this is intended.
+    transform = transform_element!(node)
+
+    # If this node is in the dynamic whitelist array (built at runtime by
+    # transformers), let it live with all of its attributes intact.
+    return if @whitelist_nodes.include?(node)
+
+    name = node.name.to_s.downcase
+
+    # Delete any element that isn't whitelisted, either by a transformer
+    # (transform[:whitelist]) or by @config[:elements]. Its children are moved
+    # in front of it first, so only the element itself is dropped.
+    unless transform[:whitelist] || @config[:elements].include?(name)
+      node.children.each { |n| node.add_previous_sibling(n) }
+      node.unlink
+      return
     end
+
+    # Allowed attributes: those granted by transformers, those configured for
+    # this element name, and those configured for :all elements.
+    attr_whitelist = (transform[:attr_whitelist] +
+        (@config[:attributes][name] || []) +
+        (@config[:attributes][:all] || [])).uniq
+
+    if attr_whitelist.empty?
+      # Delete all attributes from elements with no whitelisted attributes.
+      node.attribute_nodes.each {|attr| attr.remove }
+    else
+      # Delete any attribute that isn't in the whitelist for this element.
+      node.attribute_nodes.each do |attr|
+        attr.unlink unless attr_whitelist.include?(attr.name.downcase)
+      end
+
+      # Delete remaining attributes that use unacceptable protocols.
+      if @config[:protocols].has_key?(name)
+        protocol = @config[:protocols][name]
+
+        node.attribute_nodes.each do |attr|
+          attr_name = attr.name.downcase
+          # Attributes without a protocol entry are not restricted here.
+          next false unless protocol.has_key?(attr_name)
+
+          # A value with a protocol-like prefix survives only if that prefix is
+          # listed; a value without one survives only if :relative is listed.
+          del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
+            !protocol[attr_name].include?($1.downcase)
+          else
+            !protocol[attr_name].include?(:relative)
+          end
+
+          attr.unlink if del
+        end
+      end
+    end
+
+    # Add required attributes.
+    if @config[:add_attributes].has_key?(name)
+      @config[:add_attributes][name].each do |key, val|
+        node[key] = val
+      end
+    end
+
+    transform
   end
 
+  # Runs _node_ through every transformer in @config[:transformers] and merges
+  # their responses into a single Hash with the keys :attr_whitelist (Array),
+  # :node (the node chosen by the transformers) and :whitelist (boolean).
+  #
+  # Each transformer is called with a Hash holding :config and :node, and must
+  # return +nil+ or a Hash; anything else raises Error. If the transformers
+  # chose a different node, it replaces _node_ in the document.
+  def transform_element!(node)
+    summary = {
+      :attr_whitelist => [],
+      :node           => node,
+      :whitelist      => false
+    }
+
+    @config[:transformers].each do |transformer|
+      # Each transformer sees the node selected by the transformers before it.
+      response = transformer.call({
+        :config => @config,
+        :node   => summary[:node]
+      })
+
+      # nil means "nothing to report"; move on to the next transformer.
+      next if response.nil?
+
+      unless response.is_a?(Hash)
+        raise Error, "transformer output must be a Hash or nil"
+      end
+
+      extra_nodes = response[:whitelist_nodes]
+      extra_attrs = response[:attr_whitelist]
+      new_node    = response[:node]
+
+      # Remember nodes the transformer wants kept with all their attributes.
+      if extra_nodes.is_a?(Array)
+        @whitelist_nodes += extra_nodes
+        @whitelist_nodes.uniq!
+      end
+
+      summary[:attr_whitelist] += extra_attrs if extra_attrs.is_a?(Array)
+      summary[:whitelist] = true if response[:whitelist]
+      summary[:node] = new_node if new_node.is_a?(Nokogiri::XML::Node)
+    end
+
+    node.replace(summary[:node]) if node != summary[:node]
+
+    summary
+  end
 end