sanitize.rb in sanitize-2.1.0

- old
+ new

@@ -34,16 +34,30 @@
 require 'sanitize/transformers/clean_element'
 
 class Sanitize
   attr_reader :config
 
+  # Matches a valid HTML5 data attribute name. The unicode ranges included here
+  # are a conservative subset of the full range of characters that are
+  # technically allowed, with the intent of matching the most common characters
+  # used in data attribute names while excluding uncommon or potentially
+  # misleading characters, or characters with the potential to be normalized
+  # into unsafe or confusing forms.
+  #
+  # If you need data attr names with characters that aren't included here (such
+  # as combining marks, full-width characters, or CJK), please consider creating
+  # a custom transformer to validate attributes according to your needs.
+  #
+  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
+  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
-  REGEX_PROTOCOL = /\A([^\/]*?)(?:\:|&#0*58|&#x0*3a)/i
+  REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
   #--
   # Class Methods
   #++
 
@@ -97,11 +111,11 @@
     @transformers[:depth] <<
         Transformers::CleanCDATA <<
         Transformers::CleanElement.new(@config)
   end
 
-  # Returns a sanitized copy of _html_.
+  # Returns a sanitized copy of the given _html_ fragment.
   def clean(html)
     if html
       dupe = html.dup
       clean!(dupe) || dupe
     end
@@ -127,15 +141,18 @@
     result = output_method.call(output_method_params)
 
     return result == html ? nil : html[0, html.length] = result
   end
 
+  # Returns a sanitized copy of the given full _html_ document.
   def clean_document(html)
     # A nil document has nothing to sanitize; the result is nil.
     return if html.nil?

     # Work on a private copy so the caller's string is never modified.
     copy = html.dup

     # clean_document! returns nil when it made no changes. In that case hand
     # back the copy rather than the caller's own object, so the result is
     # always an independent string (the same behavior as #clean).
     clean_document!(copy) || copy
   end
 
+  # Performs clean_document in place, returning _html_, or +nil+ if no changes
+  # were made.
   def clean_document!(html)
     if !@config[:elements].include?('html') && !@config[:remove_contents]
       raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
       # otherwise Nokogiri will raise for having multiple root nodes when
       # it moves its children to the root document context