# encoding: utf-8
require 'sanitize'

module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This module defines
    # what HTML is allowed in user provided content and fixes up issues with
    # unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
    #                of the options constants defined in this class or a custom
    #                sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default.
      WHITELIST = {
        :output => :xhtml,
        :elements => %w(
          a abbr b blockquote br cite code dd del dfn div dl dt em h1 h2 h3 h4
          h5 h6 hr i img ins kbd li mark meter ol p pre q s samp small span
          strong sub sup table tbody td tfoot th thead tr time ul var video wbr
        ),
        :remove_contents => ['script'],
        :attributes => {
          :all         => ['data-after', 'data-id', 'id', 'title', 'class'],
          'a'          => ['href', 'name'],
          'blockquote' => ['cite'],
          'img'        => ['alt', 'height', 'src', 'width'],
          'q'          => ['cite'],
          'time'       => ['datetime'],
          'video'      => ['src']
        },
        # Every URL-carrying attribute allowed above is restricted to a known
        # set of schemes so that e.g. `javascript:` URLs are dropped.
        :protocols => {
          'a'          => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
          'blockquote' => {'cite' => ['http', 'https', :relative]},
          'img'        => {'src'  => ['http', 'https', :relative]},
          'q'          => {'cite' => ['http', 'https', :relative]},
          'video'      => {'src'  => ['http', 'https', :relative]}
        }
      }

      # A more limited sanitization whitelist. This keeps every other setting
      # of WHITELIST (output format, attributes, protocols) but with a more
      # locked down set of allowed elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Match unicode chars encoded on 4 bytes in UTF-8, i.e. the code points
      # above the Basic Multilingual Plane (U+10000..U+10FFFF). Everything up
      # to U+FFFF fits in at most 3 bytes and is left untouched.
      MB4_REGEXP = /[\u{10000}-\u{10FFFF}]/

      # Replace utf-8 characters encoded on 4 bytes with a decimal numeric
      # character reference ("&#<codepoint>;"), because MySQL doesn't handle
      # them.
      #
      # doc - The document (or node) whose text nodes are rewritten in place.
      #
      # Returns the same doc object that was passed in.
      #
      # NOTE(review): Nokogiri documents `content=` as escaping its argument,
      # so the "&" written here may be serialized as "&amp;" - confirm that
      # the rendered output is what is intended.
      def encode_mb4(doc)
        doc.search("text()").each do |node|
          node.content = node.content.gsub(MB4_REGEXP) { |c| "&##{c.ord};" }
        end
        doc
      end

      # Sanitize markup using the Sanitize library, then encode any 4-byte
      # characters left in the cleaned document.
      #
      # Returns the value of encode_mb4, i.e. whatever Sanitize.clean_node!
      # returned after its text nodes were rewritten.
      def call
        encode_mb4(Sanitize.clean_node!(doc, whitelist))
      end

      # The whitelist to use when sanitizing. This can be passed in the context
      # hash to the filter but defaults to WHITELIST constant value above.
      def whitelist
        context[:whitelist] || WHITELIST
      end
    end
  end
end