lib/sanitize/transformers/clean_element.rb in sanitize-2.1.1 vs lib/sanitize/transformers/clean_element.rb in sanitize-3.0.0
- old
+ new
@@ -1,155 +1,125 @@
-class Sanitize; module Transformers
+# encoding: utf-8
- class CleanElement
+require 'set'
- # Attributes that need additional escaping on `<a>` elements due to unsafe
- # libxml2 behavior.
- UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
- name
- ])
+class Sanitize; module Transformers; class CleanElement
- # Attributes that need additional escaping on all elements due to unsafe
- # libxml2 behavior.
- UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
- action
- href
- src
- ])
+ # Matches a valid HTML5 data attribute name. The Unicode ranges included here
+ # are a conservative subset of the full range of characters that are
+ # technically allowed, with the intent of matching the most common characters
+ # used in data attribute names while excluding uncommon or potentially
+ # misleading characters, or characters with the potential to be normalized
+ # into unsafe or confusing forms.
+ #
+ # If you need data attribute names with characters that aren't included here
+ # (such as combining marks, full-width characters, or CJK), please consider
+ # creating a custom transformer to validate attributes according to your needs.
+ #
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
- # Mapping of original characters to escape sequences for characters that
- # should be escaped in attributes affected by unsafe libxml2 behavior.
- UNSAFE_LIBXML_ESCAPE_CHARS = {
- ' ' => '%20',
- '"' => '%22'
- }
+ def initialize(config)
+ @add_attributes = config[:add_attributes]
+ @attributes = config[:attributes].dup
+ @elements = config[:elements]
+ @protocols = config[:protocols]
+ @remove_all_contents = false
+ @remove_element_contents = Set.new
+ @whitespace_elements = {}
- # Regex that matches any single character that needs to be escaped in
- # attributes affected by unsafe libxml2 behavior.
- UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
+ @attributes.each do |element_name, attrs|
+ unless element_name == :all
+ @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
+ end
+ end
- def initialize(config)
- @config = config
-
- # For faster lookups.
- @add_attributes = config[:add_attributes]
- @allowed_elements = Set.new(config[:elements])
- @attributes = config[:attributes]
- @protocols = config[:protocols]
- @remove_all_contents = false
- @remove_element_contents = Set.new
- @whitespace_elements = Set.new(config[:whitespace_elements])
-
- if config[:remove_contents].is_a?(Array)
- @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
- else
- @remove_all_contents = !!config[:remove_contents]
+ # Backward compatibility: if :whitespace_elements is a Set, convert it to a Hash.
+ if config[:whitespace_elements].is_a?(Set)
+ config[:whitespace_elements].each do |element|
+ @whitespace_elements[element] = {:before => ' ', :after => ' '}
end
+ else
+ @whitespace_elements = config[:whitespace_elements]
end
- def call(env)
- name = env[:node_name]
- node = env[:node]
+ if config[:remove_contents].is_a?(Set)
+ @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
+ else
+ @remove_all_contents = !!config[:remove_contents]
+ end
+ end
- return if env[:is_whitelisted] || !node.element?
+ def call(env)
+ node = env[:node]
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
- # Delete any element that isn't in the config whitelist.
- unless @allowed_elements.include?(name)
- # Elements like br, div, p, etc. need to be replaced with whitespace in
- # order to preserve readability.
- if @whitespace_elements.include?(name)
- node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))
+ name = env[:node_name]
- unless node.children.empty?
- node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
- end
- end
+ # Delete any element that isn't in the config whitelist.
+ unless @elements.include?(name)
+ # Elements like br, div, p, etc. need to be replaced with whitespace in
+ # order to preserve readability.
+ if @whitespace_elements.include?(name)
+ node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
- unless @remove_all_contents || @remove_element_contents.include?(name)
- node.children.each {|n| node.add_previous_sibling(n) }
+ unless node.children.empty?
+ node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
end
+ end
- node.unlink
- return
+ unless @remove_all_contents || @remove_element_contents.include?(name)
+ node.children.each {|n| node.add_previous_sibling(n) }
end
- attr_whitelist = Set.new((@attributes[name] || []) +
- (@attributes[:all] || []))
+ node.unlink
+ return
+ end
+ attr_whitelist = @attributes[name] || @attributes[:all]
+
+ if attr_whitelist.nil?
+ # Delete all attributes from elements with no whitelisted attributes.
+ node.attribute_nodes.each {|attr| attr.unlink }
+ else
allow_data_attributes = attr_whitelist.include?(:data)
- if attr_whitelist.empty?
- # Delete all attributes from elements with no whitelisted attributes.
- node.attribute_nodes.each {|attr| attr.unlink }
- else
- # Delete any attribute that isn't allowed on this element.
- node.attribute_nodes.each do |attr|
- attr_name = attr.name.downcase
+ # Delete any attribute that isn't allowed on this element.
+ node.attribute_nodes.each do |attr|
+ attr_name = attr.name.downcase
- unless attr_whitelist.include?(attr_name)
- # The attribute isn't explicitly whitelisted.
+ if attr_whitelist.include?(attr_name)
+ # The attribute is whitelisted.
- if allow_data_attributes && attr_name.start_with?('data-')
- # Arbitrary data attributes are allowed. Verify that the attribute
- # is a valid data attribute.
- attr.unlink unless attr_name =~ REGEX_DATA_ATTR
+ # Remove any attributes that use unacceptable protocols.
+ if @protocols.include?(name) && @protocols[name].include?(attr_name)
+ attr_protocols = @protocols[name][attr_name]
+
+ if attr.value.to_s.downcase =~ REGEX_PROTOCOL
+ attr.unlink unless attr_protocols.include?($1.downcase)
else
- # Either the attribute isn't a data attribute, or arbitrary data
- # attributes aren't allowed. Remove the attribute.
- attr.unlink
+ attr.unlink unless attr_protocols.include?(:relative)
end
end
- end
+ else
+ # The attribute isn't whitelisted.
- # Delete remaining attributes that use unacceptable protocols.
- if @protocols.has_key?(name)
- protocol = @protocols[name]
-
- node.attribute_nodes.each do |attr|
- attr_name = attr.name.downcase
- next false unless protocol.has_key?(attr_name)
-
- del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
- !protocol[attr_name].include?($1.downcase)
- else
- !protocol[attr_name].include?(:relative)
- end
-
- if del
- attr.unlink
- else
- # Leading and trailing whitespace around URLs is ignored at parse
- # time. Stripping it here prevents it from being escaped by the
- # libxml2 workaround below.
- attr.value = attr.value.strip
- end
+ if allow_data_attributes && attr_name.start_with?('data-')
+ # Arbitrary data attributes are allowed. Verify that the attribute
+ # is a valid data attribute.
+ attr.unlink unless attr_name =~ REGEX_DATA_ATTR
+ else
+ # Either the attribute isn't a data attribute, or arbitrary data
+ # attributes aren't allowed. Remove the attribute.
+ attr.unlink
end
end
end
+ end
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
- # attempt to preserve server-side includes. This can result in XSS since
- # an unescaped double quote can allow an attacker to inject a
- # non-whitelisted attribute.
- #
- # Sanitize works around this by implementing its own escaping for
- # affected attributes, some of which can exist on any element and some
- # of which can only exist on `<a>` elements.
- #
- # The relevant libxml2 code is here:
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
- node.attribute_nodes.each do |attr|
- attr_name = attr.name.downcase
- if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
- (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
- attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
- end
- end
-
- # Add required attributes.
- if @add_attributes.has_key?(name)
- @add_attributes[name].each {|key, val| node[key] = val }
- end
+ # Add required attributes.
+ if @add_attributes.include?(name)
+ @add_attributes[name].each {|key, val| node[key] = val }
end
end
-end; end
+end; end; end