lib/loofah/html5/scrub.rb in loofah-2.19.0 vs lib/loofah/html5/scrub.rb in loofah-2.19.1
- old
+ new
@@ -34,28 +34,17 @@
attr_node.remove
next
end
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
- # this block lifted nearly verbatim from HTML5 sanitization
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
- attr_node.remove
- next
- elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
- # permit only allowed data mediatypes
- mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
- mediatype, _ = mediatype.split(";")[0..1] if mediatype
- if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
- attr_node.remove
- next
- end
- end
+ next if scrub_uri_attribute(attr_node)
end
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, " ") if attr_node.value
+ scrub_attribute_that_allows_local_ref(attr_node)
end
+
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
attr_node.remove
next
end
end
@@ -125,10 +114,51 @@
end
Crass::Parser.stringify(sanitized_tree)
end
# Rewrite the value of +attr_node+ so that it only keeps components that are
# safe for an attribute which may reference a local element.
#
# The value is tokenized with Crass and every component is inspected:
#   * :url components are kept only when their value starts with "#"
#   * :hash, :ident and :string components are always kept
#   * every other component is dropped
#
# The raw text of the surviving components is joined with single spaces and
# written back to the attribute. Nothing happens when the attribute has no
# value.
def scrub_attribute_that_allows_local_ref(attr_node)
  current_value = attr_node.value
  return unless current_value

  kept_fragments = []
  Crass::Parser.new(current_value).parse_component_values.each do |component|
    keep =
      case component[:node]
      when :url then component[:value].start_with?("#")
      when :hash, :ident, :string then true
      else false
      end
    next unless keep

    raw = component[:raw]
    kept_fragments << raw unless raw.nil?
  end

  attr_node.value = kept_fragments.join(" ")
end
+
# Remove +attr_node+ when its URI value is not acceptable.
#
# The value is HTML-unescaped, stripped of everything matching
# CONTROL_CHARACTERS and downcased before it is inspected. The attribute is
# removed when either
#   * the value starts with a scheme-like prefix and the part before the
#     first SafeList::PROTOCOL_SEPARATOR is not in SafeList::ALLOWED_PROTOCOLS
#   * that first part is "data" and the mediatype that follows it is not in
#     SafeList::ALLOWED_URI_DATA_MEDIATYPES
#
# Returns true when the attribute was removed, false otherwise.
#
# (logic lifted nearly verbatim from HTML5 sanitization)
def scrub_uri_attribute(attr_node)
  normalized = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
  segments = normalized.split(SafeList::PROTOCOL_SEPARATOR)
  scheme = segments[0]

  has_scheme_prefix = normalized =~ /^[a-z0-9][-+.a-z0-9]*:/
  if has_scheme_prefix && !SafeList::ALLOWED_PROTOCOLS.include?(scheme)
    attr_node.remove
    return true
  end

  return false unless scheme == "data"

  # permit only allowed data mediatypes; parameters after ";" are ignored
  mediatype = segments[1]
  mediatype = mediatype.split(";").first if mediatype
  return false if mediatype.nil? || SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)

  attr_node.remove
  true
end
+
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
#
# see comments about CVE-2018-8048 within the tests for more information
#
@@ -148,9 +178,49 @@
#
encoding = attr_node.value.encoding
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
end.force_encoding(encoding)
+ end
+ end
+
# Tell whether the content of +node+ has to go through cdata_escape.
#
# Truthy when the node is flagged as cdata. Nokogiri's HTML4 parser on JRuby
# doesn't flag the child of a `style` or `script` tag as cdata, but it acts
# that way, so on JRuby a text node whose parent is one of those two elements
# is treated the same.
def cdata_needs_escaping?(node)
  flagged_as_cdata = node.cdata?
  return flagged_as_cdata if flagged_as_cdata

  Nokogiri.jruby? && node.text? && %w[style script].include?(node.parent.name)
end
+
# Build a replacement node holding the text of +node+ after it has been passed
# through escape_tags.
#
# The new node is created from the document of +node+: a plain text node on
# JRuby, a cdata node everywhere else. The new node is returned; +node+ itself
# is not modified here.
def cdata_escape(node)
  safe_text = escape_tags(node.text)
  owner_document = node.document
  return owner_document.create_text_node(safe_text) if Nokogiri.jruby?

  owner_document.create_cdata(safe_text)
end
+
# Replacement table used by escape_tags: each markup-significant character
# maps to its HTML entity. Quotes are not part of the table.
TABLE_FOR_ESCAPE_HTML__ = {
  '<' => '&lt;',
  '>' => '&gt;',
  '&' => '&amp;',
}.freeze

# Return a copy of +string+ in which "<", ">" and "&" are replaced by
# "&lt;", "&gt;" and "&amp;". The argument itself is never modified and the
# result carries the same encoding as the argument.
#
# This is a modified version of CGI.escapeHTML from ruby 3.1 (quotes are left
# untouched).
#
# string - the String to escape
#
# Returns the escaped String.
def escape_tags(string)
  enc = string.encoding

  if enc.ascii_compatible?
    # Work on the raw bytes so that invalid byte sequences cannot make the
    # substitution raise, then restore the original encoding label.
    return string.b.gsub(/[<>&]/, TABLE_FOR_ESCAPE_HTML__).force_encoding(enc)
  end

  origenc = nil
  if enc.dummy?
    # Dummy encodings cannot be matched directly; go through their
    # ASCII-compatible counterpart and convert back afterwards.
    origenc = enc
    enc = Encoding::Converter.asciicompat_encoding(enc)
    unless enc
      # No counterpart exists: the only thing left is a byte-level
      # substitution, relabelled with the original encoding.
      return string.b.gsub(/[<>&]/, TABLE_FOR_ESCAPE_HTML__).force_encoding(origenc)
    end
    string = string.encode(enc)
  end

  # Pattern and replacements have to be expressed in the working encoding.
  table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
  string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
  string.encode!(origenc) if origenc
  string
end
end
end
end