lib/sanitize/transformers/clean_element.rb in sanitize-4.6.6 vs lib/sanitize/transformers/clean_element.rb in sanitize-5.0.0
- old
+ new
@@ -65,11 +65,11 @@
end
else
@whitespace_elements = config[:whitespace_elements]
end
- if config[:remove_contents].is_a?(Set)
+ if config[:remove_contents].is_a?(Enumerable)
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
else
@remove_all_contents = !!config[:remove_contents]
end
end
@@ -95,12 +95,14 @@
unless node.children.empty?
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
end
end
- unless @remove_all_contents || @remove_element_contents.include?(name)
- node.add_previous_sibling(node.children)
+ unless node.children.empty?
+ unless @remove_all_contents || @remove_element_contents.include?(name)
+ node.add_previous_sibling(node.children)
+ end
end
node.unlink
return
end
@@ -164,10 +166,15 @@
#
# Sanitize works around this by implementing its own escaping for
# affected attributes, some of which can exist on any element and some
# of which can only exist on `<a>` elements.
#
+ # This workaround is technically no longer necessary with Nokogumbo >= 2.0,
+ # which no longer uses libxml2's serializer, but it's retained to avoid
+ # breaking use cases where people sanitize individual Nokogiri nodes and
+ # then serialize them manually without Nokogumbo.
+ #
# The relevant libxml2 code is here:
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
@@ -177,9 +184,43 @@
end
# Add required attributes.
if @add_attributes.include?(name)
@add_attributes[name].each {|key, val| node[key] = val }
+ end
+
+ # Element-specific special cases.
+ case name
+
+ # If this is a whitelisted iframe that has children, remove all its
+ # children. The HTML standard says iframes shouldn't have content, but when
+ # they do, this content is parsed as text and is serialized verbatim without
+ # being escaped, which is unsafe because legacy browsers may still render it
+ # and execute `<script>` content. So the safe and correct thing to do is to
+ # always remove iframe content.
+ when 'iframe'
+ if !node.children.empty?
+ node.children.each do |child|
+ child.unlink
+ end
+ end
+
+ # Rewrite any `<meta>` charset declaration other than UTF-8 so that it
+ # declares UTF-8 instead, since Sanitize's output is always UTF-8.
+ when 'meta'
+ if node.has_attribute?('charset') &&
+ node['charset'].downcase != 'utf-8'
+
+ node['charset'] = 'utf-8'
+ end
+
+ if node.has_attribute?('http-equiv') &&
+ node.has_attribute?('content') &&
+ node['http-equiv'].downcase == 'content-type' &&
+ node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
+
+ node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
+ end
end
end
end; end; end