sanitize.rb in sanitize-1.0.6

- old
+ new

@@ -24,49 +24,40 @@
 $:.unshift(File.dirname(File.expand_path(__FILE__)))
 $:.uniq!
 
 require 'rubygems'
 
-gem 'hpricot',      '~> 0.6'
-gem 'htmlentities', '~> 4.0.0'
+gem 'hpricot', '~> 0.6'
 
 require 'hpricot'
-require 'htmlentities'
 require 'sanitize/config'
 require 'sanitize/config/restricted'
 require 'sanitize/config/basic'
 require 'sanitize/config/relaxed'
-require 'sanitize/monkeypatch/hpricot'
 
 class Sanitize
 
+  # Characters that should be replaced with entities in text nodes.
+  ENTITY_MAP = {
+    '<' => '&lt;',
+    '>' => '&gt;',
+    '"' => '&quot;',
+    "'" => '&#39;'
+  }
+
+  # Matches an unencoded ampersand that is not part of a valid character entity
+  # reference.
+  REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i
+
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
 
   #--
-  # Class Methods
-  #++
-
-  # Returns a sanitized copy of _html_, using the settings in _config_ if
-  # specified.
-  def self.clean(html, config = {})
-    sanitize = Sanitize.new(config)
-    sanitize.clean(html)
-  end
-
-  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def self.clean!(html, config = {})
-    sanitize = Sanitize.new(config)
-    sanitize.clean!(html)
-  end
-
-  #--
   # Instance Methods
   #++
 
   # Returns a new Sanitize object initialized with the settings in _config_.
   def initialize(config = {})
@@ -99,14 +90,23 @@
         unless @config[:elements].include?(name)
           node.parent.replace_child(node, node.children || '')
           next
         end
 
-        if @config[:attributes].has_key?(name)
+        node.raw_attributes ||= {}
+
+        attr_whitelist = ((@config[:attributes][name] || []) +
+            (@config[:attributes][:all] || [])).uniq
+
+        if attr_whitelist.empty?
+          # Delete all attributes from elements with no whitelisted
+          # attributes.
+          node.raw_attributes = {}
+        else
           # Delete any attribute that isn't in the whitelist for this element.
           node.raw_attributes.delete_if do |key, value|
-            !@config[:attributes][name].include?(key.to_s.downcase)
+            !attr_whitelist.include?(key.to_s.downcase)
           end
 
           # Delete remaining attributes that use unacceptable protocols.
           if @config[:protocols].has_key?(name)
             protocol = @config[:protocols][name]
@@ -120,34 +120,63 @@
               else
                 !protocol[key].include?(:relative)
               end
             end
           end
-        else
-          # Delete all attributes from elements with no whitelisted
-          # attributes.
-          node.raw_attributes = {}
         end
 
         # Add required attributes.
         if @config[:add_attributes].has_key?(name)
           node.raw_attributes.merge!(@config[:add_attributes][name])
         end
+
+        # Escape special chars in attribute values.
+        node.raw_attributes.each do |key, value|
+          node.raw_attributes[key] = Sanitize.encode_html(value)
+        end
       end
     end
 
     # Make one last pass through the fragment and encode all special HTML chars
-    # and non-ASCII chars as entities. This eliminates certain types of
-    # maliciously-malformed nested tags and also compensates for Hpricot's
-    # burning desire to decode all entities.
-    coder = HTMLEntities.new
-
-    fragment.traverse_element do |node|
-      if node.text?
-        node.swap(coder.encode(node.inner_text, :named))
-      end
+    # as entities. This eliminates certain types of maliciously-malformed nested
+    # tags.
+    fragment.search('*') do |node|
+      node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
     end
 
     result = fragment.to_s
     return result == html ? nil : html[0, html.length] = result
   end
+
+  #--
+  # Class Methods
+  #++
+
+  class << self
+    # One-shot form of Sanitize#clean: builds a Sanitize object from _config_
+    # (defaults to an empty hash) and returns the sanitized copy of _html_
+    # produced by that object's #clean.
+    def clean(html, config = {})
+      Sanitize.new(config).clean(html)
+    end
+
+    # One-shot form of Sanitize#clean!: builds a Sanitize object from _config_
+    # (defaults to an empty hash) and sanitizes _html_ in place. Returns the
+    # sanitized text, or +nil+ when nothing had to be changed.
+    def clean!(html, config = {})
+      Sanitize.new(config).clean!(html)
+    end
+
+    # Returns a copy of _html_ in which the special HTML characters <, >, ", ',
+    # and unencoded & are replaced by entity references. _html_ is not modified.
+    def encode_html(html)
+      # Apply each ENTITY_MAP substitution in turn to a copy of the input.
+      escaped = ENTITY_MAP.inject(html.dup) do |text, (char, entity)|
+        text.gsub(char, entity)
+      end
+
+      # Escape only the ampersands REGEX_AMPERSAND flags as not starting an entity.
+      escaped.gsub(REGEX_AMPERSAND, '&amp;')
+    end
+  end
+
 end