lib/sanitize/transformers/clean_element.rb in sanitize-4.6.2 vs lib/sanitize/transformers/clean_element.rb in sanitize-4.6.3

- old
+ new

@@ -16,10 +16,35 @@
  # a custom transformer to validate attributes according to your needs.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+ # Attributes that need additional escaping on `<a>` elements due to unsafe
+ # libxml2 behavior.
+ UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
+   name
+ ])
+
+ # Attributes that need additional escaping on all elements due to unsafe
+ # libxml2 behavior.
+ UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
+   action
+   href
+   src
+ ])
+
+ # Mapping of original characters to escape sequences for characters that
+ # should be escaped in attributes affected by unsafe libxml2 behavior.
+ UNSAFE_LIBXML_ESCAPE_CHARS = {
+   ' ' => '%20',
+   '"' => '%22'
+ }
+
+ # Regex that matches any single character that needs to be escaped in
+ # attributes affected by unsafe libxml2 behavior.
+ UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
+
  def initialize(config)
    @add_attributes = config[:add_attributes]
    @attributes = config[:attributes].dup
    @elements = config[:elements]
    @protocols = config[:protocols]
@@ -90,34 +115,64 @@
    # Delete any attribute that isn't allowed on this element.
    node.attribute_nodes.each do |attr|
      attr_name = attr.name.downcase
-     if attr_whitelist.include?(attr_name)
-       # The attribute is whitelisted.
+     unless attr_whitelist.include?(attr_name)
+       # The attribute isn't whitelisted.
-       # Remove any attributes that use unacceptable protocols.
-       if @protocols.include?(name) && @protocols[name].include?(attr_name)
-         attr_protocols = @protocols[name][attr_name]
+       if allow_data_attributes && attr_name.start_with?('data-')
+         # Arbitrary data attributes are allowed. If this is a data
+         # attribute, continue.
+         next if attr_name =~ REGEX_DATA_ATTR
+       end
-         if attr.value =~ REGEX_PROTOCOL
-           attr.unlink unless attr_protocols.include?($1.downcase)
-         else
-           attr.unlink unless attr_protocols.include?(:relative)
+       # Either the attribute isn't a data attribute or arbitrary data
+       # attributes aren't allowed. Remove the attribute.
+       attr.unlink
+       next
+     end
+
+     # The attribute is whitelisted.
+
+     # Remove any attributes that use unacceptable protocols.
+     if @protocols.include?(name) && @protocols[name].include?(attr_name)
+       attr_protocols = @protocols[name][attr_name]
+
+       if attr.value =~ REGEX_PROTOCOL
+         unless attr_protocols.include?($1.downcase)
+           attr.unlink
+           next
          end
-       end
-     else
-       # The attribute isn't whitelisted.
-       if allow_data_attributes && attr_name.start_with?('data-')
-         # Arbitrary data attributes are allowed. Verify that the attribute
-         # is a valid data attribute.
-         attr.unlink unless attr_name =~ REGEX_DATA_ATTR
        else
-         # Either the attribute isn't a data attribute, or arbitrary data
-         # attributes aren't allowed. Remove the attribute.
-         attr.unlink
+         unless attr_protocols.include?(:relative)
+           attr.unlink
+           next
+         end
        end
+
+       # Leading and trailing whitespace around URLs is ignored at parse
+       # time. Stripping it here prevents it from being escaped by the
+       # libxml2 workaround below.
+       attr.value = attr.value.strip
+     end
+
+     # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
+     # attempt to preserve server-side includes. This can result in XSS since
+     # an unescaped double quote can allow an attacker to inject a
+     # non-whitelisted attribute.
+     #
+     # Sanitize works around this by implementing its own escaping for
+     # affected attributes, some of which can exist on any element and some
+     # of which can only exist on `<a>` elements.
+     #
+     # The relevant libxml2 code is here:
+     # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
+     if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
+         (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
+
+       attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
      end
    end
  end
  # Add required attributes.