lib/loofah/html5/scrub.rb in loofah-2.19.0 vs lib/loofah/html5/scrub.rb in loofah-2.19.1

- old
+ new

@@ -34,28 +34,17 @@
               attr_node.remove
               next
             end

             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(";")[0..1] if mediatype
-                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
+              next if scrub_uri_attribute(attr_node)
             end
+
             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, " ") if attr_node.value
+              scrub_attribute_that_allows_local_ref(attr_node)
             end
+
             if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
               attr_node.remove
               next
             end
           end
@@ -125,10 +114,51 @@
           end

           Crass::Parser.stringify(sanitized_tree)
         end

+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              else
+                nil
+              end
+            when :hash, :ident, :string
+              node[:raw]
+            else
+              nil
+            end
+          end.compact
+
+          attr_node.value = values.join(" ")
+        end
+
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
+        end
+
         #
         # libxml2 >= 2.9.2 fails to escape comments within some attributes.
         #
         # see comments about CVE-2018-8048 within the tests for more information
         #
@@ -148,9 +178,49 @@
             #
             encoding = attr_node.value.encoding
             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
               "%" + m.unpack("H2" * m.bytesize).join("%").upcase
             end.force_encoding(encoding)
+          end
+        end
+
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+        end
+
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
           end
         end
       end
     end
   end