lib/sanitize.rb in sanitize-2.1.1 vs lib/sanitize.rb in sanitize-3.0.0
- old
+ new
@@ -1,215 +1,260 @@
# encoding: utf-8
-#--
-# Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the 'Software'), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#++
+require 'nokogumbo'
require 'set'
-require 'nokogiri'
-require 'sanitize/version'
-require 'sanitize/config'
-require 'sanitize/config/restricted'
-require 'sanitize/config/basic'
-require 'sanitize/config/relaxed'
-require 'sanitize/transformers/clean_cdata'
-require 'sanitize/transformers/clean_comment'
-require 'sanitize/transformers/clean_element'
+require_relative 'sanitize/version'
+require_relative 'sanitize/config'
+require_relative 'sanitize/config/default'
+require_relative 'sanitize/config/restricted'
+require_relative 'sanitize/config/basic'
+require_relative 'sanitize/config/relaxed'
+require_relative 'sanitize/css'
+require_relative 'sanitize/transformers/clean_cdata'
+require_relative 'sanitize/transformers/clean_comment'
+require_relative 'sanitize/transformers/clean_css'
+require_relative 'sanitize/transformers/clean_doctype'
+require_relative 'sanitize/transformers/clean_element'
class Sanitize
attr_reader :config
- # Matches a valid HTML5 data attribute name. The unicode ranges included here
- # are a conservative subset of the full range of characters that are
- # technically allowed, with the intent of matching the most common characters
- # used in data attribute names while excluding uncommon or potentially
- # misleading characters, or characters with the potential to be normalized
- # into unsafe or confusing forms.
- #
- # If you need data attr names with characters that aren't included here (such
- # as combining marks, full-width characters, or CJK), please consider creating
- # a custom transformer to validate attributes according to your needs.
- #
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
- REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
-
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
#
# The colon may appear literally, as a decimal character reference (&#58 with
# any number of leading zeros) or as a hex character reference (&#x3a with any
# number of leading zeros); the trailing semicolon is deliberately not
# required. Capture group 1 holds the text before the colon. Values whose
# prefix would contain "/" or "#" (relative paths, fragments) do not match.
REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
+ # Matches Unicode characters that should be stripped from HTML before passing
+ # it to the parser.
+ #
+ # Used by #preprocess, which deletes every match from the input string.
+ #
+ # http://www.w3.org/TR/unicode-xml/#Charlist
+ REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+
#--
# Class Methods
#++
- # Returns a sanitized copy of _html_, using the settings in _config_ if
- # specified.
- def self.clean(html, config = {})
- Sanitize.new(config).clean(html)
+ # Returns a sanitized copy of the given full _html_ document, using the
+ # settings in _config_ if specified.
+ #
+ # When sanitizing a document, the `<html>` element must be whitelisted or an
+ # error will be raised. If this is undesirable, you should probably use
+ # {#fragment} instead.
+ #
+ # Convenience wrapper: builds a one-off Sanitize instance from _config_ and
+ # delegates to the instance method #document.
+ def self.document(html, config = {})
+ Sanitize.new(config).document(html)
end
- # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
- # were made.
- def self.clean!(html, config = {})
- Sanitize.new(config).clean!(html)
+ # Returns a sanitized copy of the given _html_ fragment, using the settings in
+ # _config_ if specified.
+ #
+ # Convenience wrapper: builds a one-off Sanitize instance from _config_ and
+ # delegates to the instance method #fragment.
+ def self.fragment(html, config = {})
+ Sanitize.new(config).fragment(html)
end
- # Performs a Sanitize#clean using a full-document HTML parser instead of
- # the default fragment parser. This will add a DOCTYPE and html tag
- # unless they are already present
- def self.clean_document(html, config = {})
- Sanitize.new(config).clean_document(html)
+ # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
+ #
+ # Convenience wrapper: builds a one-off Sanitize instance from _config_ and
+ # delegates to the instance method #node!, which modifies _node_ in place.
+ def self.node!(node, config = {})
+ Sanitize.new(config).node!(node)
end
- # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
- # changes were made.
- def self.clean_document!(html, config = {})
- Sanitize.new(config).clean_document!(html)
- end
+ # Aliases for pre-3.0.0 backcompat.
+ #
+ # The singleton class is opened so that alias_method applies to the
+ # class-level methods (Sanitize.document, Sanitize.fragment, Sanitize.node!)
+ # rather than to the instance methods of the same names.
+ class << Sanitize
+ # @deprecated Use {.document} instead.
+ alias_method :clean_document, :document
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
- def self.clean_node!(node, config = {})
- Sanitize.new(config).clean_node!(node)
+ # @deprecated Use {.fragment} instead.
+ alias_method :clean, :fragment
+
+ # @deprecated Use {.node!} instead.
+ alias_method :clean_node!, :node!
end
#--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_.
+ #
+ # _config_ is merged over Config::DEFAULT and the result is exposed through
+ # the +config+ reader. The transformer chain is assembled once here, in the
+ # order the transformers will be called for every node.
def initialize(config = {})
- @config = Config::DEFAULT.merge(config)
+ @config = Config.merge(Config::DEFAULT, config)
- @transformers = {
- :breadth => Array(@config[:transformers_breadth].dup),
- :depth => Array(@config[:transformers]) + Array(@config[:transformers_depth])
- }
+ # Custom transformers come first; Array() accepts nil, a single transformer
+ # or a list of them.
+ @transformers = Array(@config[:transformers].dup)
- # Default depth transformers. These always run at the end of the chain,
- # after any custom transformers.
- @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
+ # Default transformers always run at the end of the chain, after any custom
+ # transformers.
+ @transformers << Transformers::CleanComment unless @config[:allow_comments]
+ @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]
- @transformers[:depth] <<
+ # CSS cleaning is only wired in when <style> elements are allowed.
+ # NOTE(review): Sanitize::CSS receives the caller's raw _config_, not the
+ # merged @config -- presumably it applies its own defaults; confirm.
+ if @config[:elements].include?('style')
+ scss = Sanitize::CSS.new(config)
+ @transformers << Transformers::CSS::CleanElement.new(scss)
+ end
+
+ # Likewise for style attributes; `||=` reuses the CSS sanitizer created
+ # above when there is one.
+ if @config[:attributes].values.any? {|attr| attr.include?('style') }
+ scss ||= Sanitize::CSS.new(config)
+ @transformers << Transformers::CSS::CleanAttribute.new(scss)
+ end
+
+ # CleanCDATA and CleanElement are always appended last, in that order.
+ @transformers <<
Transformers::CleanCDATA <<
Transformers::CleanElement.new(@config)
end
- # Returns a sanitized copy of the given _html_ fragment.
- def clean(html)
- if html
- dupe = html.dup
- clean!(dupe) || dupe
- end
+ # Returns a sanitized copy of the given _html_ document.
+ #
+ # When sanitizing a document, the `<html>` element must be whitelisted or an
+ # error will be raised. If this is undesirable, you should probably use
+ # {#fragment} instead.
+ #
+ # Returns an empty string when _html_ is nil or false. Otherwise the input is
+ # preprocessed, parsed with Nokogiri::HTML5, sanitized in place with #node!
+ # and serialized with #to_html.
+ def document(html)
+ return '' unless html
+
+ doc = Nokogiri::HTML5.parse(preprocess(html))
+ node!(doc)
+ to_html(doc)
end
- # Performs clean in place, returning _html_, or +nil+ if no changes were
- # made.
- def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
- fragment = parser.parse(html)
- clean_node!(fragment)
+ # Instance-level alias kept so callers of the pre-3.0.0 method name still work.
+ #
+ # @deprecated Use {#document} instead.
+ alias_method :clean_document, :document
- output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
+ # Returns a sanitized copy of the given _html_ fragment.
+ #
+ # Returns an empty string when _html_ is nil or false.
+ def fragment(html)
+ return '' unless html
- if @config[:output] == :xhtml
- output_method = fragment.method(:to_xhtml)
- output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
- elsif @config[:output] == :html
- output_method = fragment.method(:to_html)
+ # The fragment is parsed as a full document, prefixed with <html><body>, and
+ # its nodes are then selected from under /html/body by the XPath below.
+ html = preprocess(html)
+ doc = Nokogiri::HTML5.parse("<html><body>#{html}")
+
+ # Hack to allow fragments containing <body>. Borrowed from
+ # Nokogiri::HTML::DocumentFragment.
+ if html =~ /\A<body(?:\s|>)/i
+ path = '/html/body'
else
- raise Error, "unsupported output format: #{@config[:output]}"
+ path = '/html/body/node()'
end
- result = output_method.call(output_method_params)
+ # Move the selected nodes into a fragment obtained from the document, so
+ # that only the fragment's content is sanitized and serialized.
+ frag = doc.fragment
+ doc.xpath(path).each {|node| frag << node }
- return result == html ? nil : html[0, html.length] = result
+ node!(frag)
+ to_html(frag)
end
- # Returns a sanitized copy of the given full _html_ document.
- def clean_document(html)
- unless html.nil?
- clean_document!(html.dup) || html
+ # Instance-level alias kept so callers of the pre-3.0.0 method name still work.
+ #
+ # @deprecated Use {#fragment} instead.
+ alias_method :clean, :fragment
+
+ # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
+ # in place.
+ #
+ # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
+ # whitelisted or an error will be raised.
+ #
+ # Raises ArgumentError when _node_ is not a `Nokogiri::XML::Node`. Returns the
+ # same _node_ object that was passed in.
+ def node!(node)
+ raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+
+ if node.is_a?(Nokogiri::XML::Document)
+ unless @config[:elements].include?('html')
+ raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+ end
end
- end
- # Performs clean_document in place, returning _html_, or +nil+ if no changes
- # were made.
- def clean_document!(html)
- if !@config[:elements].include?('html') && !@config[:remove_contents]
- raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
- # otherwise Nokogiri will raise for having multiple root nodes when
- # it moves its children to the root document context
+ # One whitelist set is shared by every transformer call in this traversal;
+ # transform_node! adds to it whatever nodes the transformers return.
+ node_whitelist = Set.new
+
+ traverse(node) do |n|
+ transform_node!(n, node_whitelist)
end
- clean!(html, Nokogiri::HTML::Document)
+ node
end
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
- def clean_node!(node)
- raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+ # Instance-level alias kept so callers of the pre-3.0.0 method name still work.
+ #
+ # @deprecated Use {#node!} instead.
+ alias_method :clean_node!, :node!
- node_whitelist = Set.new
+ private
- unless @transformers[:breadth].empty?
- traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
+ # Preprocesses HTML before parsing to remove undesirable Unicode chars.
+ #
+ # Works on a private copy of the input, so the caller's string is never
+ # modified, frozen strings are accepted, and non-String input (including nil)
+ # is coerced with #to_s. Text that is not already UTF-8 is transcoded,
+ # replacing invalid or undefined sequences, and then every match of
+ # REGEX_UNSUITABLE_CHARS is deleted.
+ #
+ # Returns the cleaned copy.
+ def preprocess(html)
+ # Rebind to the copy: without this assignment the copy is discarded and the
+ # encode!/gsub! calls below mutate the caller's original string in place.
+ html = html.to_s.dup
+
+ unless html.encoding.name == 'UTF-8'
+ html.encode!('UTF-8',
+ :invalid => :replace,
+ :undef => :replace)
end
- traverse_depth(node) {|n| transform_node!(n, node_whitelist, :depth) }
- node
+ html.gsub!(REGEX_UNSUITABLE_CHARS, '')
+ html
end
- private
+ # Serializes _node_ to an HTML string (encoding 'utf-8', no indentation).
+ #
+ # For document nodes, the Content-Type meta tag that serialization injects is
+ # stripped again unless <meta> is whitelisted and the document itself carries
+ # a content-type meta tag.
+ def to_html(node)
+ replace_meta = false
- def transform_node!(node, node_whitelist, mode)
- @transformers[mode].each do |transformer|
- result = transformer.call({
+ # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
+ # meta tag to all serialized HTML documents.
+ #
+ # https://github.com/sparklemotion/nokogiri/issues/1008
+ if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
+ node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
+
+ regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
+
+ # Only replace the content-type meta tag if <meta> isn't whitelisted or
+ # the original document didn't actually include a content-type meta tag.
+ replace_meta = !@config[:elements].include?('meta') ||
+ node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
+ meta['http-equiv'].downcase == 'content-type'
+ end
+ end
+
+ so = Nokogiri::XML::Node::SaveOptions
+
+ # Serialize to HTML without any formatting to prevent Nokogiri from adding
+ # newlines after certain tags.
+ html = node.to_html(
+ :encoding => 'utf-8',
+ :indent => 0,
+ :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
+ )
+
+ # Drop the injected meta tag but keep the captured <html>/<head> prefix (\1).
+ # regex_meta is only assigned in the document branch above, which is also
+ # the only place replace_meta can become true.
+ html.gsub!(regex_meta, '\1') if replace_meta
+ html
+ end
+
+ # Runs every transformer in @transformers against _node_, in order.
+ #
+ # Each transformer is called with a hash of :config, :is_whitelisted, :node,
+ # :node_name (lowercased) and :node_whitelist. When a transformer returns a
+ # Hash whose :node_whitelist responds to #each, those entries are merged into
+ # the shared _node_whitelist_ set. Returns _node_.
+ def transform_node!(node, node_whitelist)
+ @transformers.each do |transformer|
+ result = transformer.call(
:config => @config,
:is_whitelisted => node_whitelist.include?(node),
:node => node,
:node_name => node.name.downcase,
- :node_whitelist => node_whitelist,
- :traversal_mode => mode
- })
+ :node_whitelist => node_whitelist
+ )
if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
node_whitelist.merge(result[:node_whitelist])
end
end
node
end
- # Performs breadth-first traversal, operating first on the root node, then
- # traversing downwards.
- def traverse_breadth(node, &block)
+ # Performs top-down traversal of the given node, operating first on the node
+ # itself, then traversing each child (if any) in order.
+ #
+ # The child list is walked by hand through sibling links, so the walk can
+ # continue when the block unlinks or reparents the child it was just given.
+ def traverse(node, &block)
block.call(node)
- node.children.each {|child| traverse_breadth(child, &block) }
- end
- # Performs depth-first traversal, operating first on the deepest nodes in the
- # document, then traversing upwards to the root.
- def traverse_depth(node, &block)
- node.children.each {|child| traverse_depth(child, &block) }
- block.call(node)
+ child = node.child
+
+ while child do
+ # Remember the previous sibling before recursing; it is the anchor used
+ # to resume the walk if the child is no longer under this node afterwards.
+ prev = child.previous_sibling
+ traverse(child, &block)
+
+ if child.parent != node
+ # The child was unlinked or reparented, so traverse the previous node's
+ # next sibling, or the parent's first child if there is no previous
+ # node.
+ child = prev ? prev.next_sibling : node.child
+ else
+ child = child.next_sibling
+ end
+ end
end
# Error class for failures raised by Sanitize itself; #node! raises it when a
# document is sanitized without "<html>" being whitelisted.
class Error < StandardError; end
end