lib/sanitize.rb in sanitize-1.2.1.dev.20100124 vs lib/sanitize.rb in sanitize-1.2.1.dev.20100329

- old
+ new

@@ -68,21 +68,26 @@ # Returns a new Sanitize object initialized with the settings in _config_. def initialize(config = {}) # Sanitize configuration. @config = Config::DEFAULT.merge(config) - @config[:transformers] = Array(@config[:transformers]) + @config[:transformers] = Array(@config[:transformers].dup) - # :remove_contents takes precedence over :escape_only. - if @config[:remove_contents] && @config[:escape_only] - @config[:escape_only] = false - end - # Convert the list of allowed elements to a Hash for faster lookup. @allowed_elements = {} @config[:elements].each {|el| @allowed_elements[el] = true } + # Convert the list of :remove_contents elements to a Hash for faster lookup. + @remove_all_contents = false + @remove_element_contents = {} + + if @config[:remove_contents].is_a?(Array) + @config[:remove_contents].each {|el| @remove_element_contents[el] = true } + else + @remove_all_contents = !!@config[:remove_contents] + end + # Specific nodes to whitelist (along with all their attributes). This array # is generated at runtime by transformers, and is cleared before and after # a fragment is cleaned (so it applies only to a specific fragment). @whitelist_nodes = [] end @@ -97,11 +102,11 @@ # made. def clean!(html) fragment = Nokogiri::HTML::DocumentFragment.parse(html) clean_node!(fragment) - output_method_params = {:encoding => 'utf-8', :indent => 0} + output_method_params = {:encoding => @config[:output_encoding], :indent => 0} if @config[:output] == :xhtml output_method = fragment.method(:to_xhtml) output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML elsif @config[:output] == :html @@ -110,14 +115,10 @@ raise Error, "unsupported output format: #{@config[:output]}" end result = output_method.call(output_method_params) - # Ensure that the result is always a UTF-8 string in Ruby 1.9, no matter - # what. Nokogiri seems to return empty strings as ASCII for some reason. - result.force_encoding('utf-8') if RUBY_VERSION >= '1.9' - return result == html ? nil : html[0, html.length] = result end # Sanitizes the specified Nokogiri::XML::Node and all its children. def clean_node!(node) @@ -127,17 +128,11 @@ node.traverse do |child| if child.element? clean_element!(child) elsif child.comment? - unless @config[:allow_comments] - if @config[:escape_only] - child.replace(Nokogiri::XML::Text.new(child.to_s, child.document)) - else - child.unlink - end - end + child.unlink unless @config[:allow_comments] elsif child.cdata? child.replace(Nokogiri::XML::Text.new(child.text, child.document)) end end @@ -158,18 +153,14 @@ name = node.name.to_s.downcase # Delete any element that isn't in the whitelist. unless transform[:whitelist] || @allowed_elements[name] - if @config[:escape_only] - node.replace(Nokogiri::XML::Text.new(node.to_s, node.document)) - else - unless @config[:remove_contents] - node.children.each { |n| node.add_previous_sibling(n) } - end - - node.unlink + unless @remove_all_contents || @remove_element_contents[name] + node.children.each { |n| node.add_previous_sibling(n) } end + + node.unlink return end attr_whitelist = (transform[:attr_whitelist] +