lib/sanitize.rb in sanitize-2.1.1 vs lib/sanitize.rb in sanitize-3.0.0

- old
+ new

@@ -1,215 +1,260 @@ # encoding: utf-8 -#-- -# Copyright (c) 2013 Ryan Grove <ryan@wonko.com> -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the 'Software'), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
require 'nokogumbo'
require 'set'

require_relative 'sanitize/version'
require_relative 'sanitize/config'
require_relative 'sanitize/config/default'
require_relative 'sanitize/config/restricted'
require_relative 'sanitize/config/basic'
require_relative 'sanitize/config/relaxed'
require_relative 'sanitize/css'
require_relative 'sanitize/transformers/clean_cdata'
require_relative 'sanitize/transformers/clean_comment'
require_relative 'sanitize/transformers/clean_css'
require_relative 'sanitize/transformers/clean_doctype'
require_relative 'sanitize/transformers/clean_element'

# Whitelist-based HTML sanitizer. Input is parsed with Nokogiri's HTML5 parser,
# every node is passed through a chain of transformers, and the result is
# serialized back to an HTML string.
class Sanitize
  # The merged configuration (defaults plus the overrides given to {#initialize}).
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

  # Matches Unicode characters that should be stripped from HTML before passing
  # it to the parser.
  #
  # http://www.w3.org/TR/unicode-xml/#Charlist
  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of the given full _html_ document, using the
  # settings in _config_ if specified.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def self.document(html, config = {})
    Sanitize.new(config).document(html)
  end

  # Returns a sanitized copy of the given _html_ fragment, using the settings in
  # _config_ if specified.
  def self.fragment(html, config = {})
    Sanitize.new(config).fragment(html)
  end

  # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
  # The node is modified in place and returned.
  def self.node!(node, config = {})
    Sanitize.new(config).node!(node)
  end

  # Aliases for pre-3.0.0 backcompat.
  class << Sanitize
    # @deprecated Use {.document} instead.
    alias_method :clean_document, :document

    # @deprecated Use {.fragment} instead.
    alias_method :clean, :fragment

    # @deprecated Use {.node!} instead.
    alias_method :clean_node!, :node!
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_.
  #
  # _config_ is merged over `Config::DEFAULT`. The transformer chain is built
  # once here: custom transformers first, then the built-in ones.
  def initialize(config = {})
    @config = Config.merge(Config::DEFAULT, config)

    @transformers = Array(@config[:transformers].dup)

    # Default transformers always run at the end of the chain, after any custom
    # transformers.
    @transformers << Transformers::CleanComment unless @config[:allow_comments]
    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

    # CSS transformers are only added when `style` elements or attributes are
    # whitelisted; both share a single Sanitize::CSS instance.
    if @config[:elements].include?('style')
      scss = Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanElement.new(scss)
    end

    if @config[:attributes].values.any? {|attr| attr.include?('style') }
      scss ||= Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanAttribute.new(scss)
    end

    @transformers <<
        Transformers::CleanCDATA <<
        Transformers::CleanElement.new(@config)
  end

  # Returns a sanitized copy of the given _html_ document. Returns an empty
  # string when _html_ is +nil+ or +false+.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def document(html)
    return '' unless html

    doc = Nokogiri::HTML5.parse(preprocess(html))
    node!(doc)
    to_html(doc)
  end

  # @deprecated Use {#document} instead.
  alias_method :clean_document, :document

  # Returns a sanitized copy of the given _html_ fragment. Returns an empty
  # string when _html_ is +nil+ or +false+.
  def fragment(html)
    return '' unless html

    html = preprocess(html)
    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")

    # Hack to allow fragments containing <body>. Borrowed from
    # Nokogiri::HTML::DocumentFragment.
    if html =~ /\A<body(?:\s|>)/i
      path = '/html/body'
    else
      path = '/html/body/node()'
    end

    # Move the parsed nodes out of the wrapper document into a fragment so only
    # the caller's content is sanitized and serialized.
    frag = doc.fragment
    doc.xpath(path).each {|node| frag << node }

    node!(frag)
    to_html(frag)
  end

  # @deprecated Use {#fragment} instead.
  alias_method :clean, :fragment

  # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
  # in place. Returns _node_.
  #
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
  # whitelisted or an error will be raised.
  #
  # Raises ArgumentError if _node_ is not a `Nokogiri::XML::Node`, and
  # Sanitize::Error if a document is given but `html` is not whitelisted.
  def node!(node)
    unless node.is_a?(Nokogiri::XML::Node)
      raise ArgumentError, 'node must be a Nokogiri::XML::Node'
    end

    if node.is_a?(Nokogiri::XML::Document)
      unless @config[:elements].include?('html')
        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
      end
    end

    # Shared across the whole traversal so a transformer can whitelist nodes
    # that are visited later.
    node_whitelist = Set.new

    traverse(node) do |n|
      transform_node!(n, node_whitelist)
    end

    node
  end

  # @deprecated Use {#node!} instead.
  alias_method :clean_node!, :node!

  private

  # Preprocesses HTML before parsing to remove undesirable Unicode chars.
  #
  # Always works on a private copy: the input is converted with #to_s and
  # duplicated, so the caller's object is never mutated, frozen strings are
  # accepted, and non-String input is handled. Returns the new UTF-8 string.
  def preprocess(html)
    # The copy must be assigned back; otherwise encode!/gsub! below would
    # modify the caller's string in place.
    html = html.to_s.dup

    unless html.encoding.name == 'UTF-8'
      html.encode!('UTF-8',
        :invalid => :replace,
        :undef   => :replace)
    end

    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
    html
  end

  # Serializes _node_ to an HTML string without added formatting.
  def to_html(node)
    replace_meta = false

    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
    # meta tag to all serialized HTML documents.
    #
    # https://github.com/sparklemotion/nokogiri/issues/1008
    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

      regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

      # Only replace the content-type meta tag if <meta> isn't whitelisted or
      # the original document didn't actually include a content-type meta tag.
      replace_meta = !@config[:elements].include?('meta') ||
        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
          meta['http-equiv'].downcase == 'content-type'
        end
    end

    so = Nokogiri::XML::Node::SaveOptions

    # Serialize to HTML without any formatting to prevent Nokogiri from adding
    # newlines after certain tags.
    html = node.to_html(
      :encoding  => 'utf-8',
      :indent    => 0,
      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
    )

    html.gsub!(regex_meta, '\1') if replace_meta
    html
  end

  # Runs every transformer, in order, against _node_. When a transformer
  # returns a Hash whose :node_whitelist is enumerable, those nodes are merged
  # into _node_whitelist_. Returns _node_.
  def transform_node!(node, node_whitelist)
    @transformers.each do |transformer|
      result = transformer.call(
        :config         => @config,
        :is_whitelisted => node_whitelist.include?(node),
        :node           => node,
        :node_name      => node.name.downcase,
        :node_whitelist => node_whitelist
      )

      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
        node_whitelist.merge(result[:node_whitelist])
      end
    end

    node
  end

  # Performs top-down traversal of the given node, operating first on the node
  # itself, then traversing each child (if any) in order.
  def traverse(node, &block)
    block.call(node)

    child = node.child

    while child do
      # Remember the previous sibling before visiting the child, because the
      # block may unlink or move the child.
      prev = child.previous_sibling
      traverse(child, &block)

      if child.parent != node
        # The child was unlinked or reparented, so traverse the previous node's
        # next sibling, or the parent's first child if there is no previous
        # node.
        child = prev ? prev.next_sibling : node.child
      else
        child = child.next_sibling
      end
    end
  end

  # Raised for Sanitize-specific failures.
  class Error < StandardError; end
end