lib/loofah/html5/scrub.rb in loofah-2.20.0 vs lib/loofah/html5/scrub.rb in loofah-2.21.0.rc1
- old
+ new
@@ -1,16 +1,17 @@
# frozen_string_literal: true
+
require "cgi"
require "crass"
module Loofah
module HTML5 # :nodoc:
module Scrub
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
- CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
- CSS_IMPORTANT = '!important'
+ CSS_IMPORTANT = "!important"
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
class << self
def allowed_element?(element_name)
@@ -24,11 +25,11 @@
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
else
attr_node.node_name
end
- if attr_name =~ DATA_ATTRIBUTE_NAME
+ if DATA_ATTRIBUTE_NAME.match?(attr_name)
next
end
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
attr_node.remove
@@ -41,14 +42,16 @@
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
scrub_attribute_that_allows_local_ref(attr_node)
end
- if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
- attr_node.remove
- next
- end
+ next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
+ attr_name == "xlink:href" &&
+ attr_node.value =~ /^\s*[^#\s].*/m
+
+ attr_node.remove
+ next
end
scrub_css_attribute(node)
node.attribute_nodes.each do |attr_node|
@@ -64,51 +67,51 @@
style = node.attributes["style"]
style.value = scrub_css(style.value) if style
end
def scrub_css(style)
+ url_flags = [:url, :bad_url]
style_tree = Crass.parse_properties(style)
sanitized_tree = []
style_tree.each do |node|
next unless node[:node] == :property
next if node[:children].any? do |child|
- [:url, :bad_url].include?(child[:node])
+ url_flags.include?(child[:node])
end
name = node[:name].downcase
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
- SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
- SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
value = node[:children].map do |child|
case child[:node]
when :whitespace
nil
when :string
- if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+ if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
Crass::Parser.stringify(child)
- else
- nil
end
when :function
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
Crass::Parser.stringify(child)
end
when :ident
keyword = child[:value]
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
- SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
- (keyword =~ CSS_KEYWORDISH)
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+ (keyword =~ CSS_KEYWORDISH)
keyword
end
else
child[:raw]
end
end.compact
next if value.empty?
+
value << CSS_IMPORTANT if node[:important]
propstring = format("%s:%s", name, value.join(" "))
sanitized_node = Crass.parse_properties(propstring).first
sanitized_tree << sanitized_node << CRASS_SEMICOLON
end
@@ -124,27 +127,24 @@
values = nodes.map do |node|
case node[:node]
when :url
if node[:value].start_with?("#")
node[:raw]
- else
- nil
end
when :hash, :ident, :string
node[:raw]
- else
- nil
end
end.compact
attr_node.value = values.join(" ")
end
def scrub_uri_attribute(attr_node)
# this block lifted nearly verbatim from HTML5 sanitization
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
+ !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
attr_node.remove
return true
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
# permit only allowed data mediatypes
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
@@ -182,12 +182,12 @@
end.force_encoding(encoding)
end
end
def cdata_needs_escaping?(node)
- # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
- node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
+ node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
end
def cdata_escape(node)
escaped_text = escape_tags(node.text)
if Nokogiri.jruby?
@@ -196,31 +196,31 @@
node.document.create_cdata(escaped_text)
end
end
TABLE_FOR_ESCAPE_HTML__ = {
- '<' => '<',
- '>' => '>',
- '&' => '&',
+ "<" => "<",
+ ">" => ">",
+ "&" => "&",
}
def escape_tags(string)
# modified version of CGI.escapeHTML from ruby 3.1
enc = string.encoding
- unless enc.ascii_compatible?
+ if enc.ascii_compatible?
+ string = string.b
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+ string.force_encoding(enc)
+ else
if enc.dummy?
origenc = enc
enc = Encoding::Converter.asciicompat_encoding(enc)
string = enc ? string.encode(enc) : string.b
end
- table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
string.encode!(origenc) if origenc
string
- else
- string = string.b
- string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
- string.force_encoding(enc)
end
end
end
end
end