lib/sanitize.rb in sanitize-5.0.0 vs lib/sanitize.rb in sanitize-5.1.0
- old
+ new
@@ -17,22 +17,37 @@
require_relative 'sanitize/transformers/clean_element'
class Sanitize
attr_reader :config
+ # Matches one or more control characters that should be removed from HTML
+ # before parsing, as defined by the HTML living standard.
+ #
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+ # - https://infra.spec.whatwg.org/#control
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+
+ # Matches one or more non-characters that should be removed from HTML before
+ # parsing, as defined by the HTML living standard.
+ #
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+ # - https://infra.spec.whatwg.org/#noncharacter
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
+
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as a numeric entity (&#58 / &#x3a, any leading zeros) and
# even if it's an incomplete entity (which IE6 and Opera will still parse).
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
- # Matches Unicode characters that should be stripped from HTML before passing
- # it to the parser.
+ # Matches one or more characters that should be stripped from HTML before
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+ # `REGEX_HTML_NON_CHARACTERS`.
#
- # http://www.w3.org/TR/unicode-xml/#Charlist
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
#--
# Class Methods
#++
@@ -106,11 +121,11 @@
# error will be raised. If this is undesirable, you should probably use
# {#fragment} instead.
def document(html)
# A nil (or false) input short-circuits to an empty string.
return '' unless html
# The 5.0.0 side (-) parsed the preprocessed input with no extra arguments;
# the 5.1.0 side (+) additionally splats @config[:parser_options] into
# Nokogiri::HTML5.parse as keyword arguments.
- doc = Nokogiri::HTML5.parse(preprocess(html))
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
# node! and to_html are defined outside this excerpt; presumably node!
# sanitizes the parsed tree in place and to_html serializes it — confirm.
node!(doc)
to_html(doc)
end
# @deprecated Use {#document} instead.
@@ -118,11 +133,10 @@
# Returns a sanitized copy of the given _html_ fragment.
def fragment(html)
# A nil (or false) input short-circuits to an empty string.
return '' unless html
# The 5.0.0 side (-) reassigned html to the preprocessed input and parsed it
# with no extra arguments; the 5.1.0 side (+) inlines the preprocess call and
# splats @config[:parser_options] into Nokogiri::HTML5.fragment as keyword
# arguments.
- html = preprocess(html)
- frag = Nokogiri::HTML5.fragment(html)
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
# node! and to_html are defined outside this excerpt; presumably node!
# sanitizes the parsed fragment in place and to_html serializes it — confirm.
node!(frag)
to_html(frag)
end
# @deprecated Use {#fragment} instead.