clean_element.rb in sanitize-5.0.0

- old
+ new

@@ -65,11 +65,11 @@
       end
     else
       @whitespace_elements = config[:whitespace_elements]
     end
 
-    if config[:remove_contents].is_a?(Set)
+    if config[:remove_contents].is_a?(Enumerable)
       @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
     else
       @remove_all_contents = !!config[:remove_contents]
     end
   end
@@ -95,12 +95,14 @@
         unless node.children.empty?
           node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
         end
       end
 
-      unless @remove_all_contents || @remove_element_contents.include?(name)
-        node.add_previous_sibling(node.children)
+      unless node.children.empty?
+        unless @remove_all_contents || @remove_element_contents.include?(name)
+          node.add_previous_sibling(node.children)
+        end
       end
 
       node.unlink
       return
     end
@@ -164,10 +166,15 @@
         #
         # Sanitize works around this by implementing its own escaping for
         # affected attributes, some of which can exist on any element and some
         # of which can only exist on `<a>` elements.
         #
+        # This fix is technically no longer necessary with Nokogumbo >= 2.0
+        # since it no longer uses libxml2's serializer, but it's retained to
+        # avoid breaking use cases where people might be sanitizing individual
+        # Nokogiri nodes and then serializing them manually without Nokogumbo.
+        #
         # The relevant libxml2 code is here:
         # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
         if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
             (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
 
@@ -177,9 +184,43 @@
     end
 
     # Add required attributes.
     if @add_attributes.include?(name)
       @add_attributes[name].each {|key, val| node[key] = val }
+    end
+
+    # Element-specific special cases.
+    case name
+
+    # If this is a whitelisted iframe that has children, remove all its
+    # children. The HTML standard says iframes shouldn't have content, but when
+    # they do, this content is parsed as text and is serialized verbatim without
+    # being escaped, which is unsafe because legacy browsers may still render it
+    # and execute `<script>` content. So the safe and correct thing to do is to
+    # always remove iframe content.
+    when 'iframe'
+      if !node.children.empty?
+        node.children.each do |child|
+          child.unlink
+        end
+      end
+
+    # Rewrite `<meta>` charset declarations other than UTF-8 to UTF-8 (the
+    # element itself is kept), since Sanitize's output is always UTF-8.
+    when 'meta'
+      if node.has_attribute?('charset') &&
+          node['charset'].downcase != 'utf-8'
+
+        node['charset'] = 'utf-8'
+      end
+
+      if node.has_attribute?('http-equiv') &&
+          node.has_attribute?('content') &&
+          node['http-equiv'].downcase == 'content-type' &&
+          node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
+
+        node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
+      end
     end
   end
 
 end; end; end