sanitize.rb in sanitize-3.0.0

- old
+ new
@@ -1,215 +1,260 @@
 # encoding: utf-8
-#--
-# Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the 'Software'), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#++
 
+require 'nokogumbo'
 require 'set'
 
-require 'nokogiri'
-require 'sanitize/version'
-require 'sanitize/config'
-require 'sanitize/config/restricted'
-require 'sanitize/config/basic'
-require 'sanitize/config/relaxed'
-require 'sanitize/transformers/clean_cdata'
-require 'sanitize/transformers/clean_comment'
-require 'sanitize/transformers/clean_element'
+require_relative 'sanitize/version'
+require_relative 'sanitize/config'
+require_relative 'sanitize/config/default'
+require_relative 'sanitize/config/restricted'
+require_relative 'sanitize/config/basic'
+require_relative 'sanitize/config/relaxed'
+require_relative 'sanitize/css'
+require_relative 'sanitize/transformers/clean_cdata'
+require_relative 'sanitize/transformers/clean_comment'
+require_relative 'sanitize/transformers/clean_css'
+require_relative 'sanitize/transformers/clean_doctype'
+require_relative 'sanitize/transformers/clean_element'
 
 class Sanitize
   attr_reader :config
 
-  # Matches a valid HTML5 data attribute name. The unicode ranges included here
-  # are a conservative subset of the full range of characters that are
-  # technically allowed, with the intent of matching the most common characters
-  # used in data attribute names while excluding uncommon or potentially
-  # misleading characters, or characters with the potential to be normalized
-  # into unsafe or confusing forms.
-  #
-  # If you need data attr names with characters that aren't included here (such
-  # as combining marks, full-width characters, or CJK), please consider creating
-  # a custom transformer to validate attributes according to your needs.
-  #
-  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
-  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
-
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
+  # Matches Unicode characters that should be stripped from HTML before passing
+  # it to the parser.
+  #
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+
   #--
   # Class Methods
   #++
 
-  # Returns a sanitized copy of _html_, using the settings in _config_ if
-  # specified.
-  def self.clean(html, config = {})
-    Sanitize.new(config).clean(html)
+  # Returns a sanitized copy of the given full _html_ document, using the
+  # settings in _config_ if specified.
+  #
+  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # error will be raised. If this is undesirable, you should probably use
+  # {#fragment} instead.
+  def self.document(html, config = {})
+    Sanitize.new(config).document(html)
   end
 
-  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def self.clean!(html, config = {})
-    Sanitize.new(config).clean!(html)
+  # Returns a sanitized copy of the given _html_ fragment, using the settings in
+  # _config_ if specified.
+  def self.fragment(html, config = {})
+    Sanitize.new(config).fragment(html)
   end
 
-  # Performs a Sanitize#clean using a full-document HTML parser instead of
-  # the default fragment parser. This will add a DOCTYPE and html tag
-  # unless they are already present
-  def self.clean_document(html, config = {})
-    Sanitize.new(config).clean_document(html)
+  # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
+  def self.node!(node, config = {})
+    Sanitize.new(config).node!(node)
   end
 
-  # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
-  # changes were made.
-  def self.clean_document!(html, config = {})
-    Sanitize.new(config).clean_document!(html)
-  end
+  # Aliases for pre-3.0.0 backcompat.
+  class << Sanitize
+    # @deprecated Use {.document} instead.
+    alias_method :clean_document, :document
 
-  # Sanitizes the specified Nokogiri::XML::Node and all its children.
-  def self.clean_node!(node, config = {})
-    Sanitize.new(config).clean_node!(node)
+    # @deprecated Use {.fragment} instead.
+    alias_method :clean, :fragment
+
+    # @deprecated Use {.node!} instead.
+    alias_method :clean_node!, :node!
   end
 
   #--
   # Instance Methods
   #++
 
   # Returns a new Sanitize object initialized with the settings in _config_.
   def initialize(config = {})
-    @config = Config::DEFAULT.merge(config)
+    @config = Config.merge(Config::DEFAULT, config)
 
-    @transformers = {
-      :breadth => Array(@config[:transformers_breadth].dup),
-      :depth   => Array(@config[:transformers]) + Array(@config[:transformers_depth])
-    }
+    @transformers = Array(@config[:transformers].dup)
 
-    # Default depth transformers. These always run at the end of the chain,
-    # after any custom transformers.
-    @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
+    # Default transformers always run at the end of the chain, after any custom
+    # transformers.
+    @transformers << Transformers::CleanComment unless @config[:allow_comments]
+    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]
 
-    @transformers[:depth] <<
+    if @config[:elements].include?('style')
+      scss = Sanitize::CSS.new(config)
+      @transformers << Transformers::CSS::CleanElement.new(scss)
+    end
+
+    if @config[:attributes].values.any? {|attr| attr.include?('style') }
+      scss ||= Sanitize::CSS.new(config)
+      @transformers << Transformers::CSS::CleanAttribute.new(scss)
+    end
+
+    @transformers <<
         Transformers::CleanCDATA <<
         Transformers::CleanElement.new(@config)
   end
 
-  # Returns a sanitized copy of the given _html_ fragment.
-  def clean(html)
-    if html
-      dupe = html.dup
-      clean!(dupe) || dupe
-    end
+  # Returns a sanitized copy of the given _html_ document.
+  #
+  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # error will be raised. If this is undesirable, you should probably use
+  # {#fragment} instead.
+  def document(html)
+    return '' unless html
+
+    doc = Nokogiri::HTML5.parse(preprocess(html))
+    node!(doc)
+    to_html(doc)
   end
 
-  # Performs clean in place, returning _html_, or +nil+ if no changes were
-  # made.
-  def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
-    fragment = parser.parse(html)
-    clean_node!(fragment)
+  # @deprecated Use {#document} instead.
+  alias_method :clean_document, :document
 
-    output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
+  # Returns a sanitized copy of the given _html_ fragment.
+  def fragment(html)
+    return '' unless html
 
-    if @config[:output] == :xhtml
-      output_method = fragment.method(:to_xhtml)
-      output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
-    elsif @config[:output] == :html
-      output_method = fragment.method(:to_html)
+    html = preprocess(html)
+    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
+
+    # Hack to allow fragments containing <body>. Borrowed from
+    # Nokogiri::HTML::DocumentFragment.
+    if html =~ /\A<body(?:\s|>)/i
+      path = '/html/body'
     else
-      raise Error, "unsupported output format: #{@config[:output]}"
+      path = '/html/body/node()'
     end
 
-    result = output_method.call(output_method_params)
+    frag = doc.fragment
+    doc.xpath(path).each {|node| frag << node }
 
-    return result == html ? nil : html[0, html.length] = result
+    node!(frag)
+    to_html(frag)
   end
 
-  # Returns a sanitized copy of the given full _html_ document.
-  def clean_document(html)
-    unless html.nil?
-      clean_document!(html.dup) || html
+  # @deprecated Use {#fragment} instead.
+  alias_method :clean, :fragment
+
+  # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
+  # in place.
+  #
+  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
+  # whitelisted or an error will be raised.
+  def node!(node)
+    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+
+    if node.is_a?(Nokogiri::XML::Document)
+      unless @config[:elements].include?('html')
+        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+      end
     end
-  end
 
-  # Performs clean_document in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def clean_document!(html)
-    if !@config[:elements].include?('html') && !@config[:remove_contents]
-      raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
-      # otherwise Nokogiri will raise for having multiple root nodes when
-      # it moves its children to the root document context
+    node_whitelist = Set.new
+
+    traverse(node) do |n|
+      transform_node!(n, node_whitelist)
     end
 
-    clean!(html, Nokogiri::HTML::Document)
+    node
   end
 
-  # Sanitizes the specified Nokogiri::XML::Node and all its children.
-  def clean_node!(node)
-    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+  # @deprecated Use {#node!} instead.
+  alias_method :clean_node!, :node!
 
-    node_whitelist = Set.new
+  private
 
-    unless @transformers[:breadth].empty?
-      traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
+  # Preprocesses HTML before parsing to remove undesirable Unicode chars.
+  #
+  # Works on a private copy (`html.to_s.dup`), so the caller's string is never
+  # mutated and frozen or non-String input is accepted. The copy is transcoded
+  # to UTF-8 when its encoding differs (invalid/undefined bytes are replaced),
+  # stripped of everything matching REGEX_UNSUITABLE_CHARS, and returned.
+  def preprocess(html)
+    html = html.to_s.dup
+
+    unless html.encoding.name == 'UTF-8'
+      html.encode!('UTF-8',
+        :invalid => :replace,
+        :undef   => :replace)
     end
 
-    traverse_depth(node) {|n| transform_node!(n, node_whitelist, :depth) }
-    node
+    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
+    html
   end
 
-  private
+  def to_html(node)
+    replace_meta = false
 
-  def transform_node!(node, node_whitelist, mode)
-    @transformers[mode].each do |transformer|
-      result = transformer.call({
+    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
+    # meta tag to all serialized HTML documents.
+    #
+    # https://github.com/sparklemotion/nokogiri/issues/1008
+    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
+        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
+
+      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
+
+      # Only replace the content-type meta tag if <meta> isn't whitelisted or
+      # the original document didn't actually include a content-type meta tag.
+      replace_meta = !@config[:elements].include?('meta') ||
+        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
+          meta['http-equiv'].downcase == 'content-type'
+        end
+    end
+
+    so = Nokogiri::XML::Node::SaveOptions
+
+    # Serialize to HTML without any formatting to prevent Nokogiri from adding
+    # newlines after certain tags.
+    html = node.to_html(
+      :encoding  => 'utf-8',
+      :indent    => 0,
+      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
+    )
+
+    html.gsub!(regex_meta, '\1') if replace_meta
+    html
+  end
+
+  def transform_node!(node, node_whitelist)
+    @transformers.each do |transformer|
+      result = transformer.call(
         :config         => @config,
         :is_whitelisted => node_whitelist.include?(node),
         :node           => node,
         :node_name      => node.name.downcase,
-        :node_whitelist => node_whitelist,
-        :traversal_mode => mode
-      })
+        :node_whitelist => node_whitelist
+      )
 
       if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
         node_whitelist.merge(result[:node_whitelist])
       end
     end
 
     node
   end
 
-  # Performs breadth-first traversal, operating first on the root node, then
-  # traversing downwards.
-  def traverse_breadth(node, &block)
+  # Performs top-down traversal of the given node, operating first on the node
+  # itself, then traversing each child (if any) in order.
+  def traverse(node, &block)
     block.call(node)
-    node.children.each {|child| traverse_breadth(child, &block) }
-  end
 
-  # Performs depth-first traversal, operating first on the deepest nodes in the
-  # document, then traversing upwards to the root.
-  def traverse_depth(node, &block)
-    node.children.each {|child| traverse_depth(child, &block) }
-    block.call(node)
+    child = node.child
+
+    while child do
+      prev = child.previous_sibling
+      traverse(child, &block)
+
+      if child.parent != node
+        # The child was unlinked or reparented, so traverse the previous node's
+        # next sibling, or the parent's first child if there is no previous
+        # node.
+        child = prev ? prev.next_sibling : node.child
+      else
+        child = child.next_sibling
+      end
+    end
   end
 
   class Error < StandardError; end
 end