scrub.rb in loofah-2.19.1

- old
+ new

@@ -34,28 +34,17 @@
               attr_node.remove
               next
             end
 
             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(";")[0..1] if mediatype
-                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
+              next if scrub_uri_attribute(attr_node)
             end
+
             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, " ") if attr_node.value
+              scrub_attribute_that_allows_local_ref(attr_node)
             end
+
             if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
               attr_node.remove
               next
             end
           end
@@ -125,10 +114,51 @@
           end
 
           Crass::Parser.stringify(sanitized_tree)
         end
 
+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              else
+                nil
+              end
+            when :hash, :ident, :string
+              node[:raw]
+            else
+              nil
+            end
+          end.compact
+
+          attr_node.value = values.join(" ")
+        end
+
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
+        end
+
         #
         #  libxml2 >= 2.9.2 fails to escape comments within some attributes.
         #
         #  see comments about CVE-2018-8048 within the tests for more information
         #
@@ -148,9 +178,49 @@
             #
             encoding = attr_node.value.encoding
             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
               "%" + m.unpack("H2" * m.bytesize).join("%").upcase
             end.force_encoding(encoding)
+          end
+        end
+
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+        end
+
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
           end
         end
       end
     end
   end