lib/sanitize.rb in sanitize-1.0.8 vs lib/sanitize.rb in sanitize-1.1.0
- old
+ new
@@ -1,5 +1,6 @@
+# encoding: utf-8
#--
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
@@ -18,44 +19,25 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++
-# Append this file's directory to the include path if it's not there already.
-$:.unshift(File.dirname(File.expand_path(__FILE__)))
-$:.uniq!
-
-require 'rubygems'
-
-gem 'hpricot', '~> 0.8.1'
-
-require 'hpricot'
+require 'nokogiri'
+require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
class Sanitize
- # Characters that should be replaced with entities in text nodes.
- ENTITY_MAP = {
- '<' => '&lt;',
- '>' => '&gt;',
- '"' => '&quot;',
- "'" => '&#39;'
- }
-
- # Matches an unencoded ampersand that is not part of a valid character entity
- # reference.
- REGEX_AMPERSAND = /&(?!(?:[a-z]+[0-9]{0,2}|#[0-9]+|#x[0-9a-f]+);)/i
-
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
- REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
+ REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
#--
# Instance Methods
#++
@@ -71,82 +53,86 @@
end
# Performs clean in place, returning _html_, or +nil+ if no changes were
# made.
def clean!(html)
- fragment = Hpricot(html)
+ fragment = Nokogiri::HTML::DocumentFragment.parse(html)
- fragment.search('*') do |node|
- if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
- node.parent.replace_child(node, '')
- next
- end
-
+ fragment.traverse do |node|
if node.comment?
- node.parent.replace_child(node, '') unless @config[:allow_comments]
- elsif node.elem?
+ node.unlink unless @config[:allow_comments]
+ elsif node.element?
name = node.name.to_s.downcase
# Delete any element that isn't in the whitelist.
unless @config[:elements].include?(name)
- node.parent.replace_child(node, node.children || '')
+ node.children.each { |n| node.add_previous_sibling(n) }
+ node.unlink
next
end
- node.raw_attributes ||= {}
-
attr_whitelist = ((@config[:attributes][name] || []) +
(@config[:attributes][:all] || [])).uniq
if attr_whitelist.empty?
# Delete all attributes from elements with no whitelisted
# attributes.
- node.raw_attributes = {}
+ node.attribute_nodes.each { |attr| attr.remove }
else
# Delete any attribute that isn't in the whitelist for this element.
- node.raw_attributes.delete_if do |key, value|
- !attr_whitelist.include?(key.to_s.downcase)
+ node.attribute_nodes.each do |attr|
+ attr.unlink unless attr_whitelist.include?(attr.name.downcase)
end
# Delete remaining attributes that use unacceptable protocols.
if @config[:protocols].has_key?(name)
protocol = @config[:protocols][name]
- node.raw_attributes.delete_if do |key, value|
- key = key.to_s.downcase
- next false unless protocol.has_key?(key)
- next true if value.nil?
+ node.attribute_nodes.each do |attr|
+ attr_name = attr.name.downcase
+ next false unless protocol.has_key?(attr_name)
- if value.to_s.downcase =~ REGEX_PROTOCOL
- !protocol[key].include?($1.downcase)
+ del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
+ !protocol[attr_name].include?($1.downcase)
else
- !protocol[key].include?(:relative)
+ !protocol[attr_name].include?(:relative)
end
+
+ attr.unlink if del
end
end
end
# Add required attributes.
if @config[:add_attributes].has_key?(name)
- node.raw_attributes.merge!(@config[:add_attributes][name])
+ @config[:add_attributes][name].each do |key, val|
+ node[key] = val
+ end
end
-
- # Escape special chars in attribute values.
- node.raw_attributes.each do |key, value|
- node.raw_attributes[key] = Sanitize.encode_html(value)
- end
+ elsif node.cdata?
+ node.replace(Nokogiri::XML::Text.new(node.text, node.document))
end
end
- # Make one last pass through the fragment and encode all special HTML chars
- # as entities. This eliminates certain types of maliciously-malformed nested
- # tags.
- fragment.search('*') do |node|
- node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
+ if @config[:output] == :xhtml
+ output_method = fragment.method(:to_xhtml)
+ elsif @config[:output] == :html
+ output_method = fragment.method(:to_html)
+ else
+ raise Error, "unsupported output format: #{@config[:output]}"
end
- result = fragment.to_s
+ if RUBY_VERSION >= '1.9'
+ # Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
+ # string no matter what we ask for. This will be fixed in 1.4.0, but for
+ # now we have to hack around it to prevent errors.
+ result = output_method.call(:encoding => 'utf-8', :indent => 0).force_encoding('utf-8')
+ result.gsub!(">\n", '>')
+ else
+ result = output_method.call(:encoding => 'utf-8', :indent => 0).gsub(">\n", '>')
+ end
+
return result == html ? nil : html[0, html.length] = result
end
#--
# Class Methods
@@ -163,21 +149,9 @@
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
# were made.
def clean!(html, config = {})
sanitize = Sanitize.new(config)
sanitize.clean!(html)
- end
-
- # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
- # references and returns the encoded string.
- def encode_html(html)
- str = html.dup
-
- # Encode special chars.
- ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
-
- # Convert unencoded ampersands to entity references.
- str.gsub(REGEX_AMPERSAND, '&amp;')
end
end
end