lib/sanitize.rb in sanitize-2.0.6 vs lib/sanitize.rb in sanitize-2.1.0
- old
+ new
@@ -34,16 +34,30 @@
require 'sanitize/transformers/clean_element'
class Sanitize
attr_reader :config
+ # Matches a valid HTML5 data attribute name. The unicode ranges included here
+ # are a conservative subset of the full range of characters that are
+ # technically allowed, with the intent of matching the most common characters
+ # used in data attribute names while excluding uncommon or potentially
+ # misleading characters, or characters with the potential to be normalized
+ # into unsafe or confusing forms.
+ #
+ # If you need data attr names with characters that aren't included here (such
+ # as combining marks, full-width characters, or CJK), please consider creating
+ # a custom transformer to validate attributes according to your needs.
+ #
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
- REGEX_PROTOCOL = /\A([^\/]*?)(?:\:|�*58|�*3a)/i
+ REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|�*58|�*3a)/i
#--
# Class Methods
#++
@@ -97,11 +111,11 @@
@transformers[:depth] <<
Transformers::CleanCDATA <<
Transformers::CleanElement.new(@config)
end
- # Returns a sanitized copy of _html_.
+ # Returns a sanitized copy of the given _html_ fragment.
def clean(html)
if html
dupe = html.dup
clean!(dupe) || dupe
end
@@ -127,15 +141,18 @@
result = output_method.call(output_method_params)
return result == html ? nil : html[0, html.length] = result
end
+ # Returns a sanitized copy of the given full _html_ document.
def clean_document(html)
  # Non-destructive counterpart of clean_document!: the caller's string is
  # never modified and never handed back. Returns +nil+ when _html_ is +nil+.
  return if html.nil?

  dupe = html.dup

  # clean_document! returns nil when it made no changes. Fall back to the
  # duplicate rather than the caller's own object, so the result is always an
  # independent copy (the same contract as #clean).
  clean_document!(dupe) || dupe
end
+ # Performs clean_document in place, returning _html_, or +nil+ if no changes
+ # were made.
def clean_document!(html)
if !@config[:elements].include?('html') && !@config[:remove_contents]
raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
# otherwise Nokogiri will raise for having multiple root nodes when
# it moves its children to the root document context