lib/sanitize.rb in sanitize-2.0.6 vs lib/sanitize.rb in sanitize-2.1.0

- old
+ new

@@ -34,16 +34,30 @@
 require 'sanitize/transformers/clean_element'

 class Sanitize
   attr_reader :config

+  # Matches a valid HTML5 data attribute name. The unicode ranges included here
+  # are a conservative subset of the full range of characters that are
+  # technically allowed, with the intent of matching the most common characters
+  # used in data attribute names while excluding uncommon or potentially
+  # misleading characters, or characters with the potential to be normalized
+  # into unsafe or confusing forms.
+  #
+  # If you need data attr names with characters that aren't included here (such
+  # as combining marks, full-width characters, or CJK), please consider creating
+  # a custom transformer to validate attributes according to your needs.
+  #
+  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
+  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
-  REGEX_PROTOCOL = /\A([^\/]*?)(?:\:|&#0*58|&#x0*3a)/i
+  REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

   #--
   # Class Methods
   #++

@@ -97,11 +111,11 @@
     @transformers[:depth] <<
         Transformers::CleanCDATA <<
         Transformers::CleanElement.new(@config)
   end

-  # Returns a sanitized copy of _html_.
+  # Returns a sanitized copy of the given _html_ fragment.
   def clean(html)
     if html
       dupe = html.dup
       clean!(dupe) || dupe
     end
@@ -127,15 +141,18 @@
     result = output_method.call(output_method_params)
     return result == html ? nil : html[0, html.length] = result
   end

+  # Returns a sanitized copy of the given full _html_ document.
   def clean_document(html)
     unless html.nil?
       clean_document!(html.dup) || html
     end
   end

+  # Performs clean_document in place, returning _html_, or +nil+ if no changes
+  # were made.
   def clean_document!(html)
     if !@config[:elements].include?('html') && !@config[:remove_contents]
       raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
       # otherwise Nokogiri will raise for having multiple root nodes when
       # it moves its children to the root document context