lib/sanitize.rb in sanitize-1.0.5 vs lib/sanitize.rb in sanitize-1.0.6

- old
+ new

@@ -24,49 +24,40 @@ $:.unshift(File.dirname(File.expand_path(__FILE__))) $:.uniq! require 'rubygems' -gem 'hpricot', '~> 0.6' -gem 'htmlentities', '~> 4.0.0' +gem 'hpricot', '~> 0.6' require 'hpricot' -require 'htmlentities' require 'sanitize/config' require 'sanitize/config/restricted' require 'sanitize/config/basic' require 'sanitize/config/relaxed' -require 'sanitize/monkeypatch/hpricot' class Sanitize + # Characters that should be replaced with entities in text nodes. + ENTITY_MAP = { + '<' => '&lt;', + '>' => '&gt;', + '"' => '&quot;', + "'" => '&#39;' + } + + # Matches an unencoded ampersand that is not part of a valid character entity + # reference. + REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i + # Matches an attribute value that could be treated by a browser as a URL # with a protocol prefix, such as "http:" or "javascript:". Any string of zero # or more characters followed by a colon is considered a match, even if the # colon is encoded as an entity and even if it's an incomplete entity (which # IE6 and Opera will still parse). REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i #-- - # Class Methods - #++ - - # Returns a sanitized copy of _html_, using the settings in _config_ if - # specified. - def self.clean(html, config = {}) - sanitize = Sanitize.new(config) - sanitize.clean(html) - end - - # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes - # were made. - def self.clean!(html, config = {}) - sanitize = Sanitize.new(config) - sanitize.clean!(html) - end - - #-- # Instance Methods #++ # Returns a new Sanitize object initialized with the settings in _config_. def initialize(config = {}) @@ -99,14 +90,23 @@ unless @config[:elements].include?(name) node.parent.replace_child(node, node.children || '') next end - if @config[:attributes].has_key?(name) + node.raw_attributes ||= {} + + attr_whitelist = ((@config[:attributes][name] || []) + + (@config[:attributes][:all] || [])).uniq + + if attr_whitelist.empty? + # Delete all attributes from elements with no whitelisted + # attributes. + node.raw_attributes = {} + else # Delete any attribute that isn't in the whitelist for this element. node.raw_attributes.delete_if do |key, value| - !@config[:attributes][name].include?(key.to_s.downcase) + !attr_whitelist.include?(key.to_s.downcase) end # Delete remaining attributes that use unacceptable protocols. if @config[:protocols].has_key?(name) protocol = @config[:protocols][name] @@ -120,34 +120,63 @@ else !protocol[key].include?(:relative) end end end - else - # Delete all attributes from elements with no whitelisted - # attributes. - node.raw_attributes = {} end # Add required attributes. if @config[:add_attributes].has_key?(name) node.raw_attributes.merge!(@config[:add_attributes][name]) end + + # Escape special chars in attribute values. + node.raw_attributes.each do |key, value| + node.raw_attributes[key] = Sanitize.encode_html(value) + end end end # Make one last pass through the fragment and encode all special HTML chars - # and non-ASCII chars as entities. This eliminates certain types of - # maliciously-malformed nested tags and also compensates for Hpricot's - # burning desire to decode all entities. - coder = HTMLEntities.new - - fragment.traverse_element do |node| - if node.text? - node.swap(coder.encode(node.inner_text, :named)) - end + # as entities. This eliminates certain types of maliciously-malformed nested + # tags. + fragment.search('*') do |node| + node.swap(Sanitize.encode_html(node.to_original_html)) if node.text? end result = fragment.to_s return result == html ? nil : html[0, html.length] = result end + + #-- + # Class Methods + #++ + + class << self + # Returns a sanitized copy of _html_, using the settings in _config_ if + # specified. + def clean(html, config = {}) + sanitize = Sanitize.new(config) + sanitize.clean(html) + end + + # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes + # were made. + def clean!(html, config = {}) + sanitize = Sanitize.new(config) + sanitize.clean!(html) + end + + # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity + # references and returns the encoded string. + def encode_html(html) + str = html.dup + + # Encode special chars. + ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) } + + # Convert unencoded ampersands to entity references. + str.gsub(REGEX_AMPERSAND, '&amp;') + end + end + end