class Sanitize; module Transformers

  class CleanElement

    # Attributes that need additional escaping on `<a>` elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
      name
    ])

    # Attributes that need additional escaping on all elements due to unsafe
    # libxml2 behavior.
    UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
      action
      href
      src
    ])

    # Mapping of original characters to escape sequences for characters that
    # should be escaped in attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_CHARS = {
      ' ' => '%20',
      '"' => '%22'
    }

    # Regex that matches any single character that needs to be escaped in
    # attributes affected by unsafe libxml2 behavior.
    UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/

    def initialize(config)
      @config = config

      # For faster lookups.
      @add_attributes          = config[:add_attributes]
      @allowed_elements        = Set.new(config[:elements])
      @attributes              = config[:attributes]
      @protocols               = config[:protocols]
      @remove_all_contents     = false
      @remove_element_contents = Set.new
      @whitespace_elements     = Set.new(config[:whitespace_elements])

      if config[:remove_contents].is_a?(Array)
        @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
      else
        @remove_all_contents = !!config[:remove_contents]
      end
    end

    def call(env)
      name = env[:node_name]
      node = env[:node]

      return if env[:is_whitelisted] || !node.element?

      # Delete any element that isn't in the config whitelist.
      unless @allowed_elements.include?(name)
        # Elements like br, div, p, etc. need to be replaced with whitespace in
        # order to preserve readability.
        if @whitespace_elements.include?(name)
          node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))

          unless node.children.empty?
            node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
          end
        end

        unless @remove_all_contents || @remove_element_contents.include?(name)
          node.children.each {|n| node.add_previous_sibling(n) }
        end

        node.unlink
        return
      end

      attr_whitelist = Set.new((@attributes[name] || []) +
          (@attributes[:all] || []))

      allow_data_attributes = attr_whitelist.include?(:data)

      if attr_whitelist.empty?
        # Delete all attributes from elements with no whitelisted attributes.
        node.attribute_nodes.each {|attr| attr.unlink }
      else
        # Delete any attribute that isn't allowed on this element.
        node.attribute_nodes.each do |attr|
          attr_name = attr.name.downcase

          unless attr_whitelist.include?(attr_name)
            # The attribute isn't explicitly whitelisted.

            if allow_data_attributes && attr_name.start_with?('data-')
              # Arbitrary data attributes are allowed. Verify that the attribute
              # is a valid data attribute.
              attr.unlink unless attr_name =~ REGEX_DATA_ATTR
            else
              # Either the attribute isn't a data attribute, or arbitrary data
              # attributes aren't allowed. Remove the attribute.
              attr.unlink
            end
          end
        end

        # Delete remaining attributes that use unacceptable protocols.
        if @protocols.has_key?(name)
          protocol = @protocols[name]

          node.attribute_nodes.each do |attr|
            attr_name = attr.name.downcase
            next false unless protocol.has_key?(attr_name)

            del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
              !protocol[attr_name].include?($1.downcase)
            else
              !protocol[attr_name].include?(:relative)
            end

            if del
              attr.unlink
            else
              # Leading and trailing whitespace around URLs is ignored at parse
              # time. Stripping it here prevents it from being escaped by the
              # libxml2 workaround below.
              attr.value = attr.value.strip
            end
          end
        end
      end

      # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
      # attempt to preserve server-side includes. This can result in XSS since
      # an unescaped double quote can allow an attacker to inject a
      # non-whitelisted attribute.
      #
      # Sanitize works around this by implementing its own escaping for
      # affected attributes, some of which can exist on any element and some
      # of which can only exist on `<a>` elements.
      #
      # The relevant libxml2 code is here:
      # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
      node.attribute_nodes.each do |attr|
        attr_name = attr.name.downcase
        if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
          (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
            attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
        end
      end

      # Add required attributes.
      if @add_attributes.has_key?(name)
        @add_attributes[name].each {|key, val| node[key] = val }
      end
    end
  end

end; end