sanitize.rb in sanitize-5.1.0

- old
+ new

@@ -17,22 +17,37 @@
 require_relative 'sanitize/transformers/clean_element'
 
 class Sanitize
   attr_reader :config
 
+  # Matches one or more control characters, other than NULL, tab, LF, FF, and CR,
+  # that should be removed from HTML before parsing, per the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+
+  # Matches one or more noncharacters (U+FDD0..U+FDEF plus the last two code
+  # points of every plane) to remove before parsing, per the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
+
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing: a run matched by either `REGEX_HTML_CONTROL_CHARACTERS` or
+  # `REGEX_HTML_NON_CHARACTERS` (a single match never mixes the two classes).
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
 
   #--
   # Class Methods
   #++
 
@@ -106,11 +121,11 @@
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
     return '' unless html
 
-    doc = Nokogiri::HTML5.parse(preprocess(html))
+    doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options]) # NOTE(review): assumes :parser_options is a Hash -- confirm
     node!(doc)
     to_html(doc)
   end
 
   # @deprecated Use {#document} instead.
@@ -118,11 +133,10 @@
 
   # Returns a sanitized copy of the given _html_ fragment.
   def fragment(html)
     return '' unless html
 
-    html = preprocess(html)
-    frag  = Nokogiri::HTML5.fragment(html)
+    frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options]) # NOTE(review): assumes :parser_options is a Hash -- confirm
     node!(frag)
     to_html(frag)
   end
 
   # @deprecated Use {#fragment} instead.