lib/sanitize.rb in sanitize-1.2.1.dev.20100124 vs lib/sanitize.rb in sanitize-1.2.1.dev.20100329
- old
+ new
@@ -68,21 +68,26 @@
# Returns a new Sanitize object initialized with the settings in _config_.
#
# _config_ is passed to Config::DEFAULT.merge, and the merged result is kept
# in @config. Config::DEFAULT is defined outside this view; presumably it is
# a Hash, in which case values given in _config_ win over the defaults.
#
# NOTE(review): this text is a diff of two revisions of the method. Lines
# prefixed with "-" exist only in the older revision, lines prefixed with "+"
# exist only in the newer revision, and unprefixed lines are common to both.
def initialize(config = {})
# Sanitize configuration.
@config = Config::DEFAULT.merge(config)
# Normalize :transformers so that later code can always treat it as an Array.
# The newer revision dups the configured value before wrapping it.
# NOTE(review): presumably the dup keeps the stored list independent of the
# object supplied by the caller or by Config::DEFAULT -- confirm.
# NOTE(review): dup is called before Array() coerces, so a nil :transformers
# value would reach nil.dup, which raises TypeError on Rubies older than 2.4.
# Confirm nil cannot get here.
- @config[:transformers] = Array(@config[:transformers])
+ @config[:transformers] = Array(@config[:transformers].dup)
# Older revision only: when both :remove_contents and :escape_only were set,
# :escape_only was forced to false. The newer revision drops this block.
- # :remove_contents takes precedence over :escape_only.
- if @config[:remove_contents] && @config[:escape_only]
- @config[:escape_only] = false
- end
-
# Convert the list of allowed elements to a Hash for faster lookup.
# Keys are the entries of @config[:elements] exactly as given; the lookup
# visible further down in this diff uses the downcased node name.
@allowed_elements = {}
@config[:elements].each {|el| @allowed_elements[el] = true }
# Newer revision only: :remove_contents is accepted in two forms.
#   * an Array: each entry becomes a key of @remove_element_contents, and
#     @remove_all_contents stays false.
#   * anything else: the value is coerced to true/false with !! and stored in
#     @remove_all_contents; @remove_element_contents stays empty.
+ # Convert the list of :remove_contents elements to a Hash for faster lookup.
+ @remove_all_contents = false
+ @remove_element_contents = {}
+
+ if @config[:remove_contents].is_a?(Array)
+ @config[:remove_contents].each {|el| @remove_element_contents[el] = true }
+ else
+ @remove_all_contents = !!@config[:remove_contents]
+ end
+
# Specific nodes to whitelist (along with all their attributes). This array
# is generated at runtime by transformers, and is cleared before and after
# a fragment is cleaned (so it applies only to a specific fragment).
@whitelist_nodes = []
end
@@ -97,11 +102,11 @@
# made.
def clean!(html)
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
clean_node!(fragment)
- output_method_params = {:encoding => 'utf-8', :indent => 0}
+ output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
if @config[:output] == :xhtml
output_method = fragment.method(:to_xhtml)
output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
elsif @config[:output] == :html
@@ -110,14 +115,10 @@
raise Error, "unsupported output format: #{@config[:output]}"
end
result = output_method.call(output_method_params)
- # Ensure that the result is always a UTF-8 string in Ruby 1.9, no matter
- # what. Nokogiri seems to return empty strings as ASCII for some reason.
- result.force_encoding('utf-8') if RUBY_VERSION >= '1.9'
-
return result == html ? nil : html[0, html.length] = result
end
# Sanitizes the specified Nokogiri::XML::Node and all its children.
def clean_node!(node)
@@ -127,17 +128,11 @@
node.traverse do |child|
if child.element?
clean_element!(child)
elsif child.comment?
- unless @config[:allow_comments]
- if @config[:escape_only]
- child.replace(Nokogiri::XML::Text.new(child.to_s, child.document))
- else
- child.unlink
- end
- end
+ child.unlink unless @config[:allow_comments]
elsif child.cdata?
child.replace(Nokogiri::XML::Text.new(child.text, child.document))
end
end
@@ -158,18 +153,14 @@
name = node.name.to_s.downcase
# Delete any element that isn't in the whitelist.
unless transform[:whitelist] || @allowed_elements[name]
- if @config[:escape_only]
- node.replace(Nokogiri::XML::Text.new(node.to_s, node.document))
- else
- unless @config[:remove_contents]
- node.children.each { |n| node.add_previous_sibling(n) }
- end
-
- node.unlink
+ unless @remove_all_contents || @remove_element_contents[name]
+ node.children.each { |n| node.add_previous_sibling(n) }
end
+
+ node.unlink
return
end
attr_whitelist = (transform[:attr_whitelist] +