lib/sanitize.rb in sanitize-1.0.5 vs lib/sanitize.rb in sanitize-1.0.6
- old
+ new
@@ -24,49 +24,40 @@
$:.unshift(File.dirname(File.expand_path(__FILE__)))
$:.uniq!
require 'rubygems'
-gem 'hpricot', '~> 0.6'
-gem 'htmlentities', '~> 4.0.0'
+gem 'hpricot', '~> 0.6'
require 'hpricot'
-require 'htmlentities'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
-require 'sanitize/monkeypatch/hpricot'
class Sanitize
+ # Characters that should be replaced with entities in text nodes.
+ ENTITY_MAP = {
+ '<' => '&lt;',
+ '>' => '&gt;',
+ '"' => '&quot;',
+ "'" => '&#39;'
+ }
+
+ # Matches an unencoded ampersand that is not part of a valid character entity
+ # reference.
+ REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i
+
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
#--
- # Class Methods
- #++
-
- # Returns a sanitized copy of _html_, using the settings in _config_ if
- # specified.
- def self.clean(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean(html)
- end
-
- # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
- # were made.
- def self.clean!(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean!(html)
- end
-
- #--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_.
def initialize(config = {})
@@ -99,14 +90,23 @@
unless @config[:elements].include?(name)
node.parent.replace_child(node, node.children || '')
next
end
- if @config[:attributes].has_key?(name)
+ node.raw_attributes ||= {}
+
+ attr_whitelist = ((@config[:attributes][name] || []) +
+ (@config[:attributes][:all] || [])).uniq
+
+ if attr_whitelist.empty?
+ # Delete all attributes from elements with no whitelisted
+ # attributes.
+ node.raw_attributes = {}
+ else
# Delete any attribute that isn't in the whitelist for this element.
node.raw_attributes.delete_if do |key, value|
- !@config[:attributes][name].include?(key.to_s.downcase)
+ !attr_whitelist.include?(key.to_s.downcase)
end
# Delete remaining attributes that use unacceptable protocols.
if @config[:protocols].has_key?(name)
protocol = @config[:protocols][name]
@@ -120,34 +120,63 @@
else
!protocol[key].include?(:relative)
end
end
end
- else
- # Delete all attributes from elements with no whitelisted
- # attributes.
- node.raw_attributes = {}
end
# Add required attributes.
if @config[:add_attributes].has_key?(name)
node.raw_attributes.merge!(@config[:add_attributes][name])
end
+
+ # Escape special chars in attribute values.
+ node.raw_attributes.each do |key, value|
+ node.raw_attributes[key] = Sanitize.encode_html(value)
+ end
end
end
# Make one last pass through the fragment and encode all special HTML chars
- # and non-ASCII chars as entities. This eliminates certain types of
- # maliciously-malformed nested tags and also compensates for Hpricot's
- # burning desire to decode all entities.
- coder = HTMLEntities.new
-
- fragment.traverse_element do |node|
- if node.text?
- node.swap(coder.encode(node.inner_text, :named))
- end
+ # as entities. This eliminates certain types of maliciously-malformed nested
+ # tags.
+ fragment.search('*') do |node|
+ node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
end
result = fragment.to_s
return result == html ? nil : html[0, html.length] = result
end
+
+ #--
+ # Class Methods
+ #++
+
+ class << self
+ # Returns a sanitized copy of _html_, using the settings in _config_ if
+ # specified.
+ def clean(html, config = {})
+ sanitize = Sanitize.new(config)
+ sanitize.clean(html)
+ end
+
+ # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
+ # were made.
+ def clean!(html, config = {})
+ sanitize = Sanitize.new(config)
+ sanitize.clean!(html)
+ end
+
+ # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
+ # references and returns the encoded string.
+ def encode_html(html)
+ str = html.dup
+
+ # Encode special chars.
+ ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
+
+ # Convert unencoded ampersands to entity references.
+ str.gsub(REGEX_AMPERSAND, '&amp;')
+ end
+ end
+
end