lib/sanitize/transformers/clean_element.rb in sanitize-4.6.6 vs lib/sanitize/transformers/clean_element.rb in sanitize-5.0.0

- old
+ new

@@ -65,11 +65,11 @@ end else @whitespace_elements = config[:whitespace_elements] end - if config[:remove_contents].is_a?(Set) + if config[:remove_contents].is_a?(Enumerable) @remove_element_contents.merge(config[:remove_contents].map(&:to_s)) else @remove_all_contents = !!config[:remove_contents] end end @@ -95,12 +95,14 @@ unless node.children.empty? node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document)) end end - unless @remove_all_contents || @remove_element_contents.include?(name) - node.add_previous_sibling(node.children) + unless node.children.empty? + unless @remove_all_contents || @remove_element_contents.include?(name) + node.add_previous_sibling(node.children) + end end node.unlink return end @@ -164,10 +166,15 @@ # # Sanitize works around this by implementing its own escaping for # affected attributes, some of which can exist on any element and some # of which can only exist on `<a>` elements. # + # This fix is technically no longer necessary with Nokogumbo >= 2.0 + # since it no longer uses libxml2's serializer, but it's retained to + # avoid breaking use cases where people might be sanitizing individual + # Nokogiri nodes and then serializing them manually without Nokogumbo. + # # The relevant libxml2 code is here: # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588> if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) || (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name)) @@ -177,9 +184,43 @@ end # Add required attributes. if @add_attributes.include?(name) @add_attributes[name].each {|key, val| node[key] = val } + end + + # Element-specific special cases. + case name + + # If this is a whitelisted iframe that has children, remove all its + # children. The HTML standard says iframes shouldn't have content, but when + # they do, this content is parsed as text and is serialized verbatim without + # being escaped, which is unsafe because legacy browsers may still render it + # and execute `<script>` content. So the safe and correct thing to do is to + # always remove iframe content. + when 'iframe' + if !node.children.empty? + node.children.each do |child| + child.unlink + end + end + + # Prevent the use of `<meta>` elements that set a charset other than UTF-8, + # since Sanitize's output is always UTF-8. + when 'meta' + if node.has_attribute?('charset') && + node['charset'].downcase != 'utf-8' + + node['charset'] = 'utf-8' + end + + if node.has_attribute?('http-equiv') && + node.has_attribute?('content') && + node['http-equiv'].downcase == 'content-type' && + node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/ + + node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8') + end end end end; end; end