begin
require "sanitize"
rescue LoadError => _
raise HTML::Pipeline::MissingDependencyError, "Missing dependency 'sanitize' for SanitizationFilter. See README.md for details."
end
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This module defines
    # what HTML is allowed in user provided content and fixes up issues with
    # unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist      - The sanitizer whitelist configuration to use. This
    #                     can be one of the options constants defined in this
    #                     class or a custom sanitize options hash.
    #   :anchor_schemes - The URL schemes to allow in the href attribute of
    #                     anchor (a) elements. The default set is provided in
    #                     the ANCHOR_SCHEMES constant in this class. If passed,
    #                     this overrides any anchor href schemes specified in
    #                     the whitelist configuration.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Element names that count as a valid list container for a list item.
      LISTS = Set.new(%w(ul ol).freeze)
      LIST_ITEM = 'li'.freeze

      # List of table child elements. These must be contained by a table
      # element or they are not allowed through. Otherwise they can be used to
      # break out of places we're using tables to contain formatted user
      # content (like pull request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
      TABLE = 'table'.freeze
      TABLE_SECTIONS = Set.new(%w(thead tbody tfoot).freeze)

      # These schemes are the only ones allowed in anchor href attributes by
      # default.
      ANCHOR_SCHEMES = ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac'].freeze

      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table thead tbody tfoot blockquote
          dl dt dd kbd q samp var hr ruby rt rp li tr td th s strike summary details
        ),
        :remove_contents => ['script'],
        :attributes => {
          'a' => ['href'],
          'img' => ['src', 'longdesc'],
          'div' => ['itemscope', 'itemtype'],
          'blockquote' => ['cite'],
          'del' => ['cite'],
          'ins' => ['cite'],
          'q' => ['cite'],
          :all => ['abbr', 'accept', 'accept-charset',
                   'accesskey', 'action', 'align', 'alt', 'axis',
                   'border', 'cellpadding', 'cellspacing', 'char',
                   'charoff', 'charset', 'checked',
                   'clear', 'cols', 'colspan', 'color',
                   'compact', 'coords', 'datetime', 'dir',
                   'disabled', 'enctype', 'for', 'frame',
                   'headers', 'height', 'hreflang',
                   'hspace', 'ismap', 'label', 'lang',
                   'maxlength', 'media', 'method',
                   'multiple', 'name', 'nohref', 'noshade',
                   'nowrap', 'open', 'prompt', 'readonly', 'rel', 'rev',
                   'rows', 'rowspan', 'rules', 'scope',
                   'selected', 'shape', 'size', 'span',
                   'start', 'summary', 'tabindex', 'target',
                   'title', 'type', 'usemap', 'valign', 'value',
                   'vspace', 'width', 'itemprop']
        },
        :protocols => {
          'a' => {'href' => ANCHOR_SCHEMES},
          'blockquote' => {'cite' => ['http', 'https', :relative]},
          'del' => {'cite' => ['http', 'https', :relative]},
          'ins' => {'cite' => ['http', 'https', :relative]},
          'q' => {'cite' => ['http', 'https', :relative]},
          'img' => {
            'src' => ['http', 'https', :relative],
            'longdesc' => ['http', 'https', :relative]
          }
        },
        :transformers => [
          # List items that have no ul/ol ancestor are removed because they
          # can break out of containing markup. Only the element itself is
          # dropped; its children are kept in its place.
          lambda { |env|
            name, node = env[:node_name], env[:node]
            if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
              node.replace(node.children)
            end
          },
          # Table child elements (sections, rows and cells) that are not
          # contained by a table element are removed; their children are kept
          # in their place.
          lambda { |env|
            name, node = env[:node_name], env[:node]
            if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && node.ancestors.none? { |n| n.name == TABLE }
              node.replace(node.children)
            end
          }
        ]
      }

      # A more limited sanitization whitelist. This includes all attributes,
      # protocols, and transformers from WHITELIST but with a more locked down
      # set of allowed elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize markup using the Sanitize library.
      #
      # Hands the filter's doc and the effective whitelist (see #whitelist) to
      # Sanitize.clean_node! and returns whatever that call returns.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing. This can be passed in the context
      # hash to the filter but defaults to WHITELIST constant value above.
      #
      # When context[:anchor_schemes] is set, the result is a copy of the
      # selected whitelist whose 'a' => 'href' protocol list is replaced by
      # those schemes.
      #
      # Returns a sanitize options Hash.
      def whitelist
        config = context[:whitelist] || WHITELIST
        anchor_schemes = context[:anchor_schemes]
        return config unless anchor_schemes

        # Copy each level that gets modified so neither the shared constants
        # nor a caller-supplied hash are mutated by the override.
        config = config.dup
        protocols = (config[:protocols] || {}).dup
        protocols['a'] = (protocols['a'] || {}).merge('href' => anchor_schemes)
        config[:protocols] = protocols
        config
      end
    end
  end
end