lib/sanitize.rb in sanitize-1.3.0.dev.20101210 vs lib/sanitize.rb in sanitize-2.0.0.dev.20101211
- old
+ new
@@ -19,16 +19,21 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++
+require 'set'
+
require 'nokogiri'
require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
+require 'sanitize/transformers/clean_cdata'
+require 'sanitize/transformers/clean_comment'
+require 'sanitize/transformers/clean_element'
class Sanitize
attr_reader :config
# Matches an attribute value that could be treated by a browser as a URL
@@ -43,58 +48,39 @@
#++
# Returns a sanitized copy of _html_, using the settings in _config_ if
# specified.
def self.clean(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean(html)
+ Sanitize.new(config).clean(html)
end
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
# were made.
def self.clean!(html, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean!(html)
+ Sanitize.new(config).clean!(html)
end
# Sanitizes the specified Nokogiri::XML::Node and all its children.
def self.clean_node!(node, config = {})
- sanitize = Sanitize.new(config)
- sanitize.clean_node!(node)
+ Sanitize.new(config).clean_node!(node)
end
#--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_.
def initialize(config = {})
- # Sanitize configuration.
- @config = Config::DEFAULT.merge(config)
- @config[:transformers] = Array(@config[:transformers].dup)
+ @config = Config::DEFAULT.merge(config)
+ @transformers = Array(@config[:transformers].dup)
- # Convert arrays to hashes for faster lookups.
- @allowed_elements = {}
- @whitespace_elements = {}
-
- @config[:elements].each {|el| @allowed_elements[el] = true }
- @config[:whitespace_elements].each {|el| @whitespace_elements[el] = true }
-
- # Convert the list of :remove_contents elements to a Hash for faster lookup.
- @remove_all_contents = false
- @remove_element_contents = {}
-
- if @config[:remove_contents].is_a?(Array)
- @config[:remove_contents].each {|el| @remove_element_contents[el] = true }
- else
- @remove_all_contents = !!@config[:remove_contents]
- end
-
- # Specific nodes to whitelist (along with all their attributes). This array
- # is generated at runtime by transformers, and is cleared before and after
- # a fragment is cleaned (so it applies only to a specific fragment).
- @whitelist_nodes = []
+ # Default transformers. These always run at the end of the transformer
+ # chain, after any custom transformers.
+ @transformers <<
+ Transformers::CleanComment <<
+ Transformers::CleanCDATA <<
+ Transformers::CleanElement.new(@config)
end
# Returns a sanitized copy of _html_.
def clean(html)
if html
@@ -127,133 +113,37 @@
# Sanitizes the specified Nokogiri::XML::Node and all its children.
def clean_node!(node)
raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
- @whitelist_nodes = []
+ node_whitelist = Set.new
+ node.traverse {|child| transform_node!(child, node_whitelist) }
- node.traverse do |child|
- if child.element? || (child.text? && @config[:process_text_nodes])
- clean_element!(child)
- elsif child.comment?
- child.unlink unless @config[:allow_comments]
- elsif child.cdata?
- child.replace(Nokogiri::XML::Text.new(child.text, child.document))
- end
- end
-
- @whitelist_nodes = []
-
node
end
private
- def clean_element!(node)
- # Run this node through all configured transformers.
- transform = transform_element!(node)
-
- # If this node is in the dynamic whitelist array (built at runtime by
- # transformers), let it live with all of its attributes intact.
- return if @whitelist_nodes.include?(node)
-
- name = node.name.to_s.downcase
-
- # Delete any element that isn't in the whitelist.
- unless transform[:whitelist] || @allowed_elements[name]
- # Elements like br, div, p, etc. need to be replaced with whitespace in
- # order to preserve readability.
- if @whitespace_elements[name]
- node.add_previous_sibling(' ')
- node.add_next_sibling(' ') unless node.children.empty?
- end
-
- unless @remove_all_contents || @remove_element_contents[name]
- node.children.each { |n| node.add_previous_sibling(n) }
- end
-
- node.unlink
-
- return
- end
-
- attr_whitelist = (transform[:attr_whitelist] +
- (@config[:attributes][name] || []) +
- (@config[:attributes][:all] || [])).uniq
-
- if attr_whitelist.empty?
- # Delete all attributes from elements with no whitelisted attributes.
- node.attribute_nodes.each {|attr| attr.remove }
- else
- # Delete any attribute that isn't in the whitelist for this element.
- node.attribute_nodes.each do |attr|
- attr.unlink unless attr_whitelist.include?(attr.name.downcase)
- end
-
- # Delete remaining attributes that use unacceptable protocols.
- if @config[:protocols].has_key?(name)
- protocol = @config[:protocols][name]
-
- node.attribute_nodes.each do |attr|
- attr_name = attr.name.downcase
- next false unless protocol.has_key?(attr_name)
-
- del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
- !protocol[attr_name].include?($1.downcase)
- else
- !protocol[attr_name].include?(:relative)
- end
-
- attr.unlink if del
- end
- end
- end
-
- # Add required attributes.
- if @config[:add_attributes].has_key?(name)
- @config[:add_attributes][name].each do |key, val|
- node[key] = val
- end
- end
-
- transform
- end
-
- def transform_element!(node)
- output = {
- :attr_whitelist => [],
- :node => node,
- :whitelist => false
- }
-
- @config[:transformers].inject(node) do |transformer_node, transformer|
- transform = transformer.call({
- :allowed_elements => @allowed_elements,
- :config => @config,
- :node => transformer_node,
- :node_name => transformer_node.name.downcase,
- :whitelist_nodes => @whitelist_nodes
+ def transform_node!(node, node_whitelist)
+ @transformers.each do |transformer|
+ result = transformer.call({
+ :config => @config,
+ :is_whitelisted => node_whitelist.include?(node),
+ :node => node,
+ :node_name => node.name.downcase,
+ :node_whitelist => node_whitelist
})
- if transform.nil?
- transformer_node
- elsif transform.is_a?(Hash)
- if transform[:whitelist_nodes].is_a?(Array)
- @whitelist_nodes += transform[:whitelist_nodes]
- @whitelist_nodes.uniq!
- end
+ # If the node has been unlinked, there's no point running subsequent
+ # transformers.
+ break if node.parent.nil? && !node.fragment?
- output[:attr_whitelist] += transform[:attr_whitelist] if transform[:attr_whitelist].is_a?(Array)
- output[:whitelist] ||= true if transform[:whitelist]
- output[:node] = transform[:node].is_a?(Nokogiri::XML::Node) ? transform[:node] : output[:node]
- else
- raise Error, "transformer output must be a Hash or nil"
+ if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
+ node_whitelist.merge(result[:node_whitelist])
end
end
- node.replace(output[:node]) if node != output[:node]
-
- return output
+ node
end
class Error < StandardError; end
end