lib/sanitize.rb in sanitize-1.1.1.dev.20091102 vs lib/sanitize.rb in sanitize-1.2.0.dev.20091104
- old
+ new
@@ -27,25 +27,57 @@
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
class Sanitize
+ attr_reader :config
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
#
# The colon may appear literally or as a decimal (&#58) or hexadecimal (&#x3a)
# character reference, with any number of leading zeros and with or without
# the trailing semicolon. Capture group 1 holds the text before the colon
# (the protocol name).
REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
#--
+ # Class Methods
+ #++
+
+ # Returns a sanitized copy of _html_, using the settings in _config_ if
+ # specified.
+ def self.clean(html, config = {})
+ sanitize = Sanitize.new(config)
+ sanitize.clean(html)
+ end
+
+ # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
+ # were made.
+ def self.clean!(html, config = {})
+ sanitize = Sanitize.new(config)
+ sanitize.clean!(html)
+ end
+
+ # Sanitizes the specified Nokogiri::XML::Node and all its children.
+ def self.clean_node!(node, config = {})
+ sanitize = Sanitize.new(config)
+ sanitize.clean_node!(node)
+ end
+
+ #--
# Instance Methods
#++
# Returns a new Sanitize object whose settings are _config_ merged into
# Config::DEFAULT.
def initialize(config = {})
+ # Effective configuration; :transformers is normalized to an Array below.
@config = Config::DEFAULT.merge(config)
+ @config[:transformers] = Array(@config[:transformers])
+
+ # Specific nodes to whitelist (along with all their attributes). This array
+ # is filled at runtime by transformers, and is reset before and after a
+ # fragment is cleaned (so it applies only to that specific fragment).
+ @whitelist_nodes = []
end
# Returns a sanitized copy of _html_.
def clean(html)
dupe = html.dup
@@ -53,103 +85,144 @@
end
# Performs clean in place: parses _html_ into a Nokogiri document fragment,
# sanitizes it with clean_node!, serializes it as :xhtml or :html according to
# @config[:output], and writes the result back into _html_. Returns the
# sanitized markup, or +nil+ if no changes were made. Raises Error for any
# other @config[:output] value.
def clean!(html)
+ @whitelist_nodes = []
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
+ clean_node!(fragment)
+ @whitelist_nodes = []
- fragment.traverse do |node|
- if node.comment?
- node.unlink unless @config[:allow_comments]
- elsif node.element?
- name = node.name.to_s.downcase
-
- # Delete any element that isn't in the whitelist.
- unless @config[:elements].include?(name)
- node.children.each { |n| node.add_previous_sibling(n) }
- node.unlink
- next
- end
-
- attr_whitelist = ((@config[:attributes][name] || []) +
- (@config[:attributes][:all] || [])).uniq
-
- if attr_whitelist.empty?
- # Delete all attributes from elements with no whitelisted
- # attributes.
- node.attribute_nodes.each { |attr| attr.remove }
- else
- # Delete any attribute that isn't in the whitelist for this element.
- node.attribute_nodes.each do |attr|
- attr.unlink unless attr_whitelist.include?(attr.name.downcase)
- end
-
- # Delete remaining attributes that use unacceptable protocols.
- if @config[:protocols].has_key?(name)
- protocol = @config[:protocols][name]
-
- node.attribute_nodes.each do |attr|
- attr_name = attr.name.downcase
- next false unless protocol.has_key?(attr_name)
-
- del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
- !protocol[attr_name].include?($1.downcase)
- else
- !protocol[attr_name].include?(:relative)
- end
-
- attr.unlink if del
- end
- end
- end
-
- # Add required attributes.
- if @config[:add_attributes].has_key?(name)
- @config[:add_attributes][name].each do |key, val|
- node[key] = val
- end
- end
- elsif node.cdata?
- node.replace(Nokogiri::XML::Text.new(node.text, node.document))
- end
- end
-
- # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
- # string no matter what we ask for. This will be fixed in 1.4.0, but for
- # now we have to hack around it to prevent errors.
output_method_params = {:encoding => 'utf-8', :indent => 0}
+
if @config[:output] == :xhtml
output_method = fragment.method(:to_xhtml)
- output_method_params.merge!(:save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML)
+ output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
elsif @config[:output] == :html
output_method = fragment.method(:to_html)
else
raise Error, "unsupported output format: #{@config[:output]}"
end
result = output_method.call(output_method_params)
+
+ # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
+ # string no matter what we ask for. This will be fixed in 1.4.0, but for
+ # now we have to hack around it to prevent errors.
result.force_encoding('utf-8') if RUBY_VERSION >= '1.9'
return result == html ? nil : html[0, html.length] = result
end
- #--
- # Class Methods
- #++
+ # Sanitizes _node_ (must be a Nokogiri::XML::Node, else ArgumentError) and all its children in place; returns _node_.
+ def clean_node!(node)
+ raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
- class << self
- # Returns a sanitized copy of _html_, using the settings in _config_ if
- # specified.
- def clean(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean(html)
+ node.traverse do |traversed_node|
+ if traversed_node.element?
+ clean_element!(traversed_node)
+ elsif traversed_node.comment?
+ traversed_node.unlink unless @config[:allow_comments]
+ elsif traversed_node.cdata?
+ traversed_node.replace(Nokogiri::XML::Text.new(traversed_node.text,
+ traversed_node.document))
+ end
end
- # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
- # were made.
- def clean!(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean!(html)
+ node
+ end
+
+ private
+
+ def clean_element!(node)
+ # Run this node through all configured transformers.
+ transform = transform_element!(node)
+
+ # If this node is in the dynamic whitelist array (built at runtime by
+ # transformers), let it live with all of its attributes intact.
+ return if @whitelist_nodes.include?(node)
+
+ name = node.name.to_s.downcase
+
+ # Delete any element that isn't in the whitelist.
+ unless transform[:whitelist] || @config[:elements].include?(name)
+ node.children.each { |n| node.add_previous_sibling(n) }
+ node.unlink
+ return
end
+
+ attr_whitelist = (transform[:attr_whitelist] +
+ (@config[:attributes][name] || []) +
+ (@config[:attributes][:all] || [])).uniq
+
+ if attr_whitelist.empty?
+ # Delete all attributes from elements with no whitelisted attributes.
+ node.attribute_nodes.each {|attr| attr.remove }
+ else
+ # Delete any attribute that isn't in the whitelist for this element.
+ node.attribute_nodes.each do |attr|
+ attr.unlink unless attr_whitelist.include?(attr.name.downcase)
+ end
+
+ # Delete remaining attributes that use unacceptable protocols.
+ if @config[:protocols].has_key?(name)
+ protocol = @config[:protocols][name]
+
+ node.attribute_nodes.each do |attr|
+ attr_name = attr.name.downcase
+ next false unless protocol.has_key?(attr_name)
+
+ del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
+ !protocol[attr_name].include?($1.downcase)
+ else
+ !protocol[attr_name].include?(:relative)
+ end
+
+ attr.unlink if del
+ end
+ end
+ end
+
+ # Add required attributes.
+ if @config[:add_attributes].has_key?(name)
+ @config[:add_attributes][name].each do |key, val|
+ node[key] = val
+ end
+ end
+
+ transform
end
+ # Runs _node_ through every transformer in @config[:transformers], in order,
+ # and merges their results. Each transformer is called with a Hash holding
+ # :config and the current :node, and must return +nil+ or a Hash that may
+ # contain :attr_whitelist (Array), :node (a replacement Nokogiri::XML::Node),
+ # :whitelist (truthy flag) and :whitelist_nodes (Array of nodes appended to
+ # @whitelist_nodes).
+ #
+ # Returns a Hash with the merged :attr_whitelist, the final :node and the
+ # :whitelist flag. Raises Error if a transformer returns anything else.
+ def transform_element!(node)
+ result = {:attr_whitelist => [], :node => node, :whitelist => false}
+
+ @config[:transformers].each do |transformer|
+ # Each transformer sees the node produced by the previous one.
+ response = transformer.call({:config => @config, :node => result[:node]})
+ next if response.nil?
+
+ unless response.is_a?(Hash)
+ raise Error, "transformer output must be a Hash or nil"
+ end
+
+ extra_nodes = response[:whitelist_nodes]
+ if extra_nodes.is_a?(Array)
+ @whitelist_nodes += extra_nodes
+ @whitelist_nodes.uniq!
+ end
+
+ extra_attrs = response[:attr_whitelist]
+ result[:attr_whitelist] += extra_attrs if extra_attrs.is_a?(Array)
+ result[:whitelist] = true if response[:whitelist]
+ result[:node] = response[:node] if response[:node].is_a?(Nokogiri::XML::Node)
+ end
+
+ # Swap the transformed node into the document in place of the original.
+ node.replace(result[:node]) if node != result[:node]
+
+ result
+ end
end