sanitize.rb in sanitize-1.1.0

- old
+ new

@@ -1,5 +1,6 @@
+# encoding: utf-8
 #--
 # Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the 'Software'), to deal
@@ -18,44 +19,25 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 #++
 
-# Append this file's directory to the include path if it's not there already.
-$:.unshift(File.dirname(File.expand_path(__FILE__)))
-$:.uniq!
-
-require 'rubygems'
-
-gem 'hpricot', '~> 0.8.1'
-
-require 'hpricot'
+require 'nokogiri'
+require 'sanitize/version'
 require 'sanitize/config'
 require 'sanitize/config/restricted'
 require 'sanitize/config/basic'
 require 'sanitize/config/relaxed'
 
 class Sanitize
 
-  # Characters that should be replaced with entities in text nodes.
-  ENTITY_MAP = {
-    '<' => '&lt;',
-    '>' => '&gt;',
-    '"' => '&quot;',
-    "'" => '&#39;'
-  }
-
-  # Matches an unencoded ampersand that is not part of a valid character entity
-  # reference.
-  REGEX_AMPERSAND = /&(?!(?:[a-z]+[0-9]{0,2}|#[0-9]+|#x[0-9a-f]+);)/i
-
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
-  REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
+  REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
 
   #--
   # Instance Methods
   #++
 
@@ -71,82 +53,86 @@
   end
 
   # Performs clean in place, returning _html_, or +nil+ if no changes were
   # made.
   def clean!(html)
-    fragment = Hpricot(html)
+    fragment = Nokogiri::HTML::DocumentFragment.parse(html)
 
-    fragment.search('*') do |node|
-      if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
-        node.parent.replace_child(node, '')
-        next
-      end
-
+    fragment.traverse do |node|
       if node.comment?
-        node.parent.replace_child(node, '') unless @config[:allow_comments]
-      elsif node.elem?
+        node.unlink unless @config[:allow_comments]
+      elsif node.element?
         name = node.name.to_s.downcase
 
         # Delete any element that isn't in the whitelist.
         unless @config[:elements].include?(name)
-          node.parent.replace_child(node, node.children || '')
+          node.children.each { |n| node.add_previous_sibling(n) }
+          node.unlink
           next
         end
 
-        node.raw_attributes ||= {}
-
         attr_whitelist = ((@config[:attributes][name] || []) +
             (@config[:attributes][:all] || [])).uniq
 
         if attr_whitelist.empty?
           # Delete all attributes from elements with no whitelisted
           # attributes.
-          node.raw_attributes = {}
+          node.attribute_nodes.each { |attr| attr.remove }
         else
           # Delete any attribute that isn't in the whitelist for this element.
-          node.raw_attributes.delete_if do |key, value|
-            !attr_whitelist.include?(key.to_s.downcase)
+          node.attribute_nodes.each do |attr|
+            attr.unlink unless attr_whitelist.include?(attr.name.downcase)
           end
 
           # Delete remaining attributes that use unacceptable protocols.
           if @config[:protocols].has_key?(name)
             protocol = @config[:protocols][name]
 
-            node.raw_attributes.delete_if do |key, value|
-              key = key.to_s.downcase
-              next false unless protocol.has_key?(key)
-              next true if value.nil?
+            node.attribute_nodes.each do |attr|
+              attr_name = attr.name.downcase
+              next false unless protocol.has_key?(attr_name)
 
-              if value.to_s.downcase =~ REGEX_PROTOCOL
-                !protocol[key].include?($1.downcase)
+              del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
+                !protocol[attr_name].include?($1.downcase)
               else
-                !protocol[key].include?(:relative)
+                !protocol[attr_name].include?(:relative)
               end
+
+              attr.unlink if del
             end
           end
         end
 
         # Add required attributes.
         if @config[:add_attributes].has_key?(name)
-          node.raw_attributes.merge!(@config[:add_attributes][name])
+          @config[:add_attributes][name].each do |key, val|
+            node[key] = val
+          end
         end
-
-        # Escape special chars in attribute values.
-        node.raw_attributes.each do |key, value|
-          node.raw_attributes[key] = Sanitize.encode_html(value)
-        end
+      elsif node.cdata?
+        node.replace(Nokogiri::XML::Text.new(node.text, node.document))
       end
     end
 
-    # Make one last pass through the fragment and encode all special HTML chars
-    # as entities. This eliminates certain types of maliciously-malformed nested
-    # tags.
-    fragment.search('*') do |node|
-      node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
+    if @config[:output] == :xhtml
+      output_method = fragment.method(:to_xhtml)
+    elsif @config[:output] == :html
+      output_method = fragment.method(:to_html)
+    else
+      raise Error, "unsupported output format: #{@config[:output]}"
     end
 
-    result = fragment.to_s
+    if RUBY_VERSION >= '1.9'
+      # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
+      # string no matter what we ask for. This will be fixed in 1.4.0, but for
+      # now we have to hack around it to prevent errors.
+      result = output_method.call(:encoding => 'utf-8', :indent => 0).force_encoding('utf-8')
+      result.gsub!(">\n", '>')
+    else
+      result = output_method.call(:encoding => 'utf-8', :indent => 0).gsub(">\n", '>')
+    end
+
     return result == html ? nil : html[0, html.length] = result
   end
 
   #--
   # Class Methods
@@ -163,21 +149,9 @@
     # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
     # were made.
     def clean!(html, config = {})
       sanitize = Sanitize.new(config)
       sanitize.clean!(html)
-    end
-
-    # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
-    # references and returns the encoded string.
-    def encode_html(html)
-      str = html.dup
-
-      # Encode special chars.
-      ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
-
-      # Convert unencoded ampersands to entity references.
-      str.gsub(REGEX_AMPERSAND, '&amp;')
     end
   end
 
 end