# encoding: utf-8
require 'sanitize'

module HTML
  class Pipeline

    # HTML filter with sanitization routines and whitelists. This filter defines
    # what HTML is allowed in user provided content and fixes up issues with
    # unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
    #                of the options constants defined in this class or a custom
    #                sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter

      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default. The hash is a Sanitize configuration; see
      # the Sanitize docs for the meaning of each key.
      WHITELIST = {
        :output => :xhtml,
        :elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
                        h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
                        q s samp small span strong sub sup table tbody td tfoot
                        th thead tr time ul var video wbr),
        :remove_contents => ['script'],
        :attributes => {
          :all         => ['data-after', 'data-id', 'id', 'title', 'class'],
          'a'          => ['href', 'name'],
          'blockquote' => ['cite'],
          'img'        => ['alt', 'height', 'src', 'width'],
          'q'          => ['cite'],
          'time'       => ['datetime'],
          'video'      => ['src']
        },
        :protocols => {
          'a'          => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
          'blockquote' => {'cite' => ['http', 'https', :relative]},
          'img'        => {'src'  => ['http', 'https', :relative]},
          'q'          => {'cite' => ['http', 'https', :relative]},
          # Video sources get the same scheme restrictions as image sources.
          'video'      => {'src'  => ['http', 'https', :relative]}
        }
      }

      # A more limited sanitization whitelist. Every setting other than
      # :elements is taken over from WHITELIST (the merge is shallow, so the
      # nested :attributes and :protocols hashes are the very same objects),
      # but the set of allowed elements is locked down further.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Match unicode chars encoded on 4 bytes in UTF-8, i.e. the code points
      # U+10000 through U+10FFFF.
      MB4_REGEXP = /[\u{10000}-\u{10ffff}]/

      # Replace utf-8 characters encoded on 4 bytes with decimal numeric
      # character references (e.g. U+1F600 becomes "&#128512;"), because MySQL
      # doesn't handle them.
      #
      # doc - Object responding to #search; the nodes it returns for "text()"
      #       must respond to #content and #content=.
      #
      # Text nodes that contain no 4-byte character are not written to.
      #
      # NOTE(review): the reference is assigned as the node's text content, so
      # a serializer may escape its leading "&" - verify the rendered output.
      #
      # Returns the doc that was passed in.
      def encode_mb4(doc)
        doc.search("text()").each do |node|
          text = node.content
          # Nothing to replace: leave the node alone instead of rewriting it.
          next unless text =~ MB4_REGEXP

          node.content = text.gsub(MB4_REGEXP) { |char| "&##{char.ord};" }
        end
        doc
      end

      # Run the document through Sanitize.clean_node! with the active
      # whitelist, then hand the result to encode_mb4.
      #
      # Returns whatever encode_mb4 returns for the sanitized node.
      def call
        sanitized = Sanitize.clean_node!(doc, whitelist)
        encode_mb4(sanitized)
      end

      # The sanitizer configuration in effect for this filter: the value stored
      # under :whitelist in the context when it is truthy, otherwise the
      # WHITELIST constant.
      def whitelist
        override = context[:whitelist]
        return override if override

        WHITELIST
      end
    end

  end
end