# encoding: utf-8
#--
# Copyright (c) 2010 Ryan Grove
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++

require 'nokogiri'
require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'

# Whitelist-based HTML sanitizer. Parses markup with Nokogiri, removes every
# element, attribute and URL protocol that the active configuration does not
# explicitly allow, and serializes the result as HTML or XHTML.
class Sanitize
  # The merged configuration Hash (Config::DEFAULT overlaid with the options
  # passed to Sanitize.new).
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  #
  # Capture group 1 holds the text preceding the colon (the candidate protocol
  # name). The colon alternatives are: a literal ":", a decimal character
  # reference ("&#58" with any number of leading zeros) and a hexadecimal
  # character reference ("&#x3a" with any number of leading zeros). The "#" in
  # the references is backslash-escaped so that the pattern itself never looks
  # like an HTML character reference to tools that process this file.
  REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&\#0*58|&\#x0*3a)/i

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of _html_, using the settings in _config_ if
  # specified.
  def self.clean(html, config = {})
    sanitize = Sanitize.new(config)
    sanitize.clean(html)
  end

  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
  # were made.
  def self.clean!(html, config = {})
    sanitize = Sanitize.new(config)
    sanitize.clean!(html)
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children.
  def self.clean_node!(node, config = {})
    sanitize = Sanitize.new(config)
    sanitize.clean_node!(node)
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_.
  # The given Hash is merged over Config::DEFAULT, so any key that is not
  # supplied keeps its default value.
  def initialize(config = {})
    # Sanitize configuration.
    @config = Config::DEFAULT.merge(config)

    # Work on a private copy of the transformer list, always as an Array.
    @config[:transformers] = Array(@config[:transformers].dup)

    # Convert the list of allowed elements to a Hash for faster lookup.
    @allowed_elements = {}
    @config[:elements].each { |el| @allowed_elements[el] = true }

    # Convert the list of :remove_contents elements to a Hash for faster lookup.
    # A non-Array value is treated as a boolean switch that applies to every
    # removed element.
    @remove_all_contents     = false
    @remove_element_contents = {}

    if @config[:remove_contents].is_a?(Array)
      @config[:remove_contents].each { |el| @remove_element_contents[el] = true }
    else
      @remove_all_contents = !!@config[:remove_contents]
    end

    # Specific nodes to whitelist (along with all their attributes). This array
    # is generated at runtime by transformers, and is cleared before and after
    # a fragment is cleaned (so it applies only to a specific fragment).
    @whitelist_nodes = []
  end

  # Returns a sanitized copy of _html_, or +nil+ if _html_ is +nil+ or +false+.
  def clean(html)
    if html
      dupe = html.dup

      # clean! returns nil when nothing changed; fall back to the untouched copy.
      clean!(dupe) || dupe
    end
  end

  # Performs clean in place. Returns the sanitized markup (which has also been
  # written into _html_), or +nil+ if no changes were made.
  #
  # Raises Sanitize::Error if the configured :output format is neither :xhtml
  # nor :html.
  def clean!(html)
    fragment = Nokogiri::HTML::DocumentFragment.parse(html)
    clean_node!(fragment)

    output_method_params = {:encoding => @config[:output_encoding], :indent => 0}

    if @config[:output] == :xhtml
      output_method = fragment.method(:to_xhtml)
      output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
    elsif @config[:output] == :html
      output_method = fragment.method(:to_html)
    else
      raise Error, "unsupported output format: #{@config[:output]}"
    end

    result = output_method.call(output_method_params)

    return nil if result == html

    # Overwrite the caller's string in place. As the last expression this
    # assignment evaluates to _result_, which becomes the return value.
    html[0, html.length] = result
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children, then
  # returns _node_.
  #
  # Raises ArgumentError if _node_ is not a Nokogiri::XML::Node.
  def clean_node!(node)
    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

    # Start from an empty dynamic whitelist for this fragment.
    @whitelist_nodes = []

    node.traverse do |child|
      if child.element? || (child.text? && @config[:process_text_nodes])
        clean_element!(child)
      elsif child.comment?
        child.unlink unless @config[:allow_comments]
      elsif child.cdata?
        # Replace CDATA sections with a plain text node carrying the same text.
        child.replace(Nokogiri::XML::Text.new(child.text, child.document))
      end
    end

    # Drop the dynamic whitelist so it cannot leak into the next fragment.
    @whitelist_nodes = []

    node
  end

  private

  # Applies the transformers and then the element, attribute and protocol
  # whitelists to a single _node_. Returns the transformer output Hash, or
  # +nil+ when the node was dynamically whitelisted or removed.
  def clean_element!(node)
    # Run this node through all configured transformers.
    transform = transform_element!(node)

    # If this node is in the dynamic whitelist array (built at runtime by
    # transformers), let it live with all of its attributes intact.
    return if @whitelist_nodes.include?(node)

    name = node.name.to_s.downcase

    # Delete any element that isn't in the whitelist.
    unless transform[:whitelist] || @allowed_elements[name]
      # Unless contents are to be removed too, hoist the children into the
      # parent (in front of the doomed node) before unlinking it.
      unless @remove_all_contents || @remove_element_contents[name]
        node.children.each { |n| node.add_previous_sibling(n) }
      end

      node.unlink
      return
    end

    # Allowed attributes: those granted by transformers, those configured for
    # this element name, and those configured for all elements.
    attr_whitelist = (transform[:attr_whitelist] +
        (@config[:attributes][name] || []) +
        (@config[:attributes][:all] || [])).uniq

    if attr_whitelist.empty?
      # Delete all attributes from elements with no whitelisted attributes.
      node.attribute_nodes.each { |attr| attr.remove }
    else
      # Delete any attribute that isn't in the whitelist for this element.
      node.attribute_nodes.each do |attr|
        attr.unlink unless attr_whitelist.include?(attr.name.downcase)
      end

      # Delete remaining attributes that use unacceptable protocols.
      if @config[:protocols].key?(name)
        protocol = @config[:protocols][name]

        node.attribute_nodes.each do |attr|
          attr_name = attr.name.downcase
          next unless protocol.key?(attr_name)

          # A value with a protocol prefix must name an allowed protocol; a
          # value without one is only kept when :relative is allowed.
          del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
            !protocol[attr_name].include?($1.downcase)
          else
            !protocol[attr_name].include?(:relative)
          end

          attr.unlink if del
        end
      end
    end

    # Add required attributes.
    if @config[:add_attributes].key?(name)
      @config[:add_attributes][name].each do |key, val|
        node[key] = val
      end
    end

    transform
  end

  # Runs _node_ through every configured transformer in order and merges
  # their results. Each transformer is called with a Hash describing the
  # current node and must return +nil+ (no opinion) or a Hash that may contain
  # :attr_whitelist, :node, :whitelist and :whitelist_nodes.
  #
  # Returns a Hash with the accumulated :attr_whitelist (Array), the final
  # :node and the :whitelist flag. If a transformer supplied a different node,
  # _node_ is replaced by it in the document.
  #
  # Raises Sanitize::Error if a transformer returns anything other than a Hash
  # or +nil+.
  def transform_element!(node)
    output = {
      :attr_whitelist => [],
      :node           => node,
      :whitelist      => false
    }

    # The block's value is the node handed to the next transformer.
    @config[:transformers].inject(node) do |transformer_node, transformer|
      transform = transformer.call({
        :allowed_elements => @allowed_elements,
        :config           => @config,
        :node             => transformer_node,
        :node_name        => transformer_node.name.downcase,
        :whitelist_nodes  => @whitelist_nodes
      })

      if transform.nil?
        transformer_node
      elsif transform.is_a?(Hash)
        if transform[:whitelist_nodes].is_a?(Array)
          @whitelist_nodes += transform[:whitelist_nodes]
          @whitelist_nodes.uniq!
        end

        output[:attr_whitelist] += transform[:attr_whitelist] if transform[:attr_whitelist].is_a?(Array)
        output[:whitelist] ||= true if transform[:whitelist]

        # Keep the previous node unless the transformer returned a real node.
        output[:node] = transform[:node].is_a?(Nokogiri::XML::Node) ? transform[:node] : output[:node]
      else
        raise Error, "transformer output must be a Hash or nil"
      end
    end

    node.replace(output[:node]) if node != output[:node]

    return output
  end

  # Raised for an unsupported output format or invalid transformer output.
  class Error < StandardError; end

end