# frozen_string_literal: true
class HTMLPipeline
# A special filter with sanization routines and allowlists. This module defines
# what HTML is allowed in user provided content and fixes up issues with
# unbalanced tags and whatnot.
#
# See the Selma docs for more information on the underlying library:
#
# https://github.com/gjtorikian/selma/#readme
#
# This filter does not write additional information to the context.
class SanitizationFilter
VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup
# The main sanitization allowlist. Only these elements and attributes are
# allowed through by default.
DEFAULT_CONFIG = Selma::Sanitizer::Config.freeze_config({
elements: ["h1", "h2", "h3", "h4", "h5", "h6", "br", "b", "i", "strong", "em", "a", "pre", "code",
"img", "tt", "div", "ins", "del", "sup", "sub", "p", "picture", "ol", "ul", "table", "thead", "tbody", "tfoot",
"blockquote", "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr", "ruby", "rt", "rp", "li", "tr", "td", "th",
"s", "strike", "summary", "details", "caption", "figure", "figcaption", "abbr", "bdo", "cite",
"dfn", "mark", "small", "source", "span", "time", "wbr",],
attributes: {
"a" => ["href"],
"img" => ["src", "longdesc", "loading", "alt"],
"div" => ["itemscope", "itemtype"],
"blockquote" => ["cite"],
"del" => ["cite"],
"ins" => ["cite"],
"q" => ["cite"],
"source" => ["srcset"],
all: ["abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "aria-describedby",
"aria-hidden", "aria-label", "aria-labelledby", "axis", "border", "char",
"charoff", "charset", "checked", "clear", "cols", "colspan", "compact", "coords", "datetime", "dir",
"disabled", "enctype", "for", "frame", "headers", "height", "hreflang", "hspace", "id", "ismap", "label", "lang",
"maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "open", "progress",
"prompt", "readonly", "rel", "rev", "role", "rows", "rowspan", "rules", "scope", "selected", "shape",
"size", "span", "start", "summary", "tabindex", "title", "type", "usemap", "valign", "value", "width", "itemprop",],
},
protocols: {
"a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
"blockquote" => { "cite" => ["http", "https", :relative].freeze },
"del" => { "cite" => ["http", "https", :relative].freeze },
"ins" => { "cite" => ["http", "https", :relative].freeze },
"q" => { "cite" => ["http", "https", :relative].freeze },
"img" => {
"src" => ["http", "https", :relative].freeze,
"longdesc" => ["http", "https", :relative].freeze,
},
},
})
class << self
def call(html, config)
raise ArgumentError, "html must be a String, not #{html.class}" unless html.is_a?(String)
raise ArgumentError, "config must be a Hash, not #{config.class}" unless config.is_a?(Hash)
sanitization_config = Selma::Sanitizer.new(config)
Selma::Rewriter.new(sanitizer: sanitization_config).rewrite(html)
end
end
end
end