# Pipeline step that runs the output of the next processor through Sanitize
# using a relaxed element/attribute whitelist, with a custom transformer that
# lets YouTube <object>/<embed> snippets through.
class TextUtils::HtmlSanitizer < TextUtils::Processor
  # Sanitize configuration: allowed elements, allowed attributes per element
  # (:all applies to every element), and the URL protocols accepted in
  # link-like attributes.
  RELAXED = {
    elements: [
      'a', 'b', 'blockquote', 'br', 'caption', 'cite', 'code', 'col',
      'colgroup', 'dd', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
      'i', 'img', 'li', 'ol', 'p', 'pre', 'q', 'small', 'strike', 'strong',
      'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'u',
      'ul', 'div', 'font', 'span', 'iframe'
    ],

    attributes: {
      all: ['class', 'style'],
      'a' => ['href', 'title', 'rel'],
      'blockquote' => ['cite'],
      'col' => ['span', 'width'],
      'colgroup' => ['span', 'width'],
      'img' => ['align', 'alt', 'height', 'src', 'title', 'width'],
      'ol' => ['start', 'type'],
      'q' => ['cite'],
      'table' => ['summary', 'width'],
      'td' => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
      'th' => ['abbr', 'axis', 'colspan', 'rowspan', 'scope', 'width'],
      'ul' => ['type'],
      'code' => ['lang', 'language'],
      'iframe' => ['height', 'scrolling', 'src', 'width']
    },

    protocols: {
      'a' => { 'href' => ['ftp', 'http', 'https', 'mailto', :relative] },
      'blockquote' => { 'cite' => ['http', 'https', :relative] },
      'img' => { 'src' => ['http', 'https', :relative] },
      'q' => { 'cite' => ['http', 'https', :relative] }
    }
  }

  # Patterns an embedded video URL must match to be whitelisted.
  # Anchored with \A (start of string) rather than ^ (start of any line) so a
  # multi-line value cannot smuggle an arbitrary first line past the check.
  VIDEO_URLS = [
    %r{\Ahttp://(?:www\.)?youtube\.com/v/}
  ]

  # Sanitize transformer that whitelists YouTube video embeds.
  #
  # Receives the transformer environment hash and reads env[:node]. Returns
  # nil when the node is not part of a recognised embed, otherwise a hash
  # telling Sanitize which nodes to whitelist.
  EMBEDDED_VIDEO = lambda do |env|
    node = env[:node]
    node_name = node.name.to_s.downcase
    parent = node.parent

    # Since the transformer receives the deepest nodes first, we look for a
    # <param> element or an <embed> element whose parent is an <object>.
    return nil unless (node_name == 'param' || node_name == 'embed') &&
                      parent.name.to_s.downcase == 'object'

    if node_name == 'param'
      # Quick XPath search to find the <param> node that contains the video URL.
      movie_node = parent.search('param[@name="movie"]')[0]
      return nil unless movie_node

      url = movie_node['value']
    else
      # Since this is an <embed>, the video URL is in the "src" attribute. No
      # extra work needed.
      url = node['src']
    end

    # Verify that the video URL is actually a valid YouTube video URL.
    return nil unless VIDEO_URLS.any? { |pattern| url =~ pattern }

    # We're now certain that this is a YouTube embed, but we still need to run
    # it through a special Sanitize step to ensure that no unwanted elements or
    # attributes that don't belong in a YouTube embed can sneak in.
    Sanitize.clean_node!(parent, {
      elements: ['embed', 'object', 'param'],
      attributes: {
        'embed' => ['allowfullscreen', 'allowscriptaccess', 'height', 'src', 'type', 'width'],
        'object' => ['height', 'width'],
        'param' => ['name', 'value']
      }
    })

    # Now that we're sure that this is a valid YouTube embed and that there are
    # no unwanted elements or attributes hidden inside it, we can tell Sanitize
    # to whitelist the current node (<param> or <embed>) and its parent
    # (<object>).
    { whitelist_nodes: [node, parent] }
  end

  # Passes +data+ down the processor chain via call_next, then sanitizes the
  # result with the RELAXED configuration plus the embedded-video transformer.
  #
  # data - the content handed to this processor
  # env  - the processing environment, forwarded unchanged to call_next
  #
  # Returns whatever Sanitize.clean returns for the processed data.
  def call(data, env)
    data = call_next(data, env)

    # NOTE(review): Sanitize documents :add_attributes as a mapping of element
    # name => { attribute => value }; this value does not have that shape and
    # may be a no-op. Left unchanged; confirm the intent.
    Sanitize.clean(data, RELAXED.merge(
      transformers: [EMBEDDED_VIDEO],
      add_attributes: { all: [:class] }
    ))
  end
end