# frozen_string_literal: true
class HTMLPipeline
# A special filter with sanitization routines and allowlists. This class defines
# what HTML is allowed in user-provided content and fixes up issues with
# unbalanced tags and whatnot.
#
# See the Selma docs for more information on the underlying library:
#
# https://github.com/gjtorikian/selma/#readme
#
# This filter does not write additional information to the context.
class SanitizationFilter
# A +dup+ of Selma's VALID_PROTOCOLS, i.e. a separate shallow copy that does
# not carry over the original object's frozen state.
# NOTE(review): DEFAULT_CONFIG's "a" => "href" protocol entry reads Selma's
# constant directly rather than this copy, so changes made to this copy are
# not reflected there -- confirm that is intended.
VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup
# The main sanitization allowlist. Only these elements and attributes are
# allowed through by default.
#
# The literal below is passed through Selma::Sanitizer::Config.freeze_config
# and the result is stored in this constant. It has three sections:
# +elements+, +attributes+ and +protocols+.
DEFAULT_CONFIG = Selma::Sanitizer::Config.freeze_config({
# Element (tag) names on the allowlist.
elements: [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"br",
"b",
"i",
"strong",
"em",
"a",
"pre",
"code",
"img",
"tt",
"div",
"ins",
"del",
"sup",
"sub",
"p",
"picture",
"ol",
"ul",
"table",
"thead",
"tbody",
"tfoot",
"blockquote",
"dl",
"dt",
"dd",
"kbd",
"q",
"samp",
"var",
"hr",
"ruby",
"rt",
"rp",
"li",
"tr",
"td",
"th",
"s",
"strike",
"summary",
"details",
"caption",
"figure",
"figcaption",
"abbr",
"bdo",
"cite",
"dfn",
"mark",
"small",
"source",
"span",
"time",
"wbr",
],
# Attribute allowlists. String keys name a single element and map to the
# attributes listed for that element.
attributes: {
"a" => ["href"],
"img" => ["src", "longdesc", "loading", "alt"],
"div" => ["itemscope", "itemtype"],
"blockquote" => ["cite"],
"del" => ["cite"],
"ins" => ["cite"],
"q" => ["cite"],
"source" => ["srcset"],
# NOTE(review): the symbol key :all presumably applies these attributes to
# every allowed element -- confirm against the Selma sanitizer docs.
all: [
"abbr",
"accept",
"accept-charset",
"accesskey",
"action",
"align",
"alt",
"aria-describedby",
"aria-hidden",
"aria-label",
"aria-labelledby",
"axis",
"border",
"char",
"charoff",
"charset",
"checked",
"clear",
"cols",
"colspan",
"compact",
"coords",
"datetime",
"dir",
"disabled",
"enctype",
"for",
"frame",
"headers",
"height",
"hreflang",
"hspace",
"id",
"ismap",
"label",
"lang",
"maxlength",
"media",
"method",
"multiple",
"name",
"nohref",
"noshade",
"nowrap",
"open",
"progress",
"prompt",
"readonly",
"rel",
"rev",
"role",
"rows",
"rowspan",
"rules",
"scope",
"selected",
"shape",
"size",
"span",
"start",
"summary",
"tabindex",
"title",
"type",
"usemap",
"valign",
"value",
"width",
"itemprop",
],
},
# Protocol lists keyed by element name, then by attribute name.
# NOTE(review): :relative presumably permits relative URLs -- confirm against
# the Selma sanitizer docs.
protocols: {
# Uses Selma's own VALID_PROTOCOLS object here, not the local
# VALID_PROTOCOLS copy defined in this class.
"a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
"blockquote" => { "cite" => ["http", "https", :relative].freeze },
"del" => { "cite" => ["http", "https", :relative].freeze },
"ins" => { "cite" => ["http", "https", :relative].freeze },
"q" => { "cite" => ["http", "https", :relative].freeze },
"img" => {
"src" => ["http", "https", :relative].freeze,
"longdesc" => ["http", "https", :relative].freeze,
},
},
})
class << self
  # Sanitizes +html+ according to an allowlist configuration.
  #
  # Builds a Selma::Sanitizer from +config+ and rewrites +html+ with a
  # Selma::Rewriter that uses it.
  #
  # @param html [String] the markup to sanitize
  # @param config [Hash] settings handed to Selma::Sanitizer.new; falls back
  #   to DEFAULT_CONFIG when the argument is omitted (an explicit +nil+ is
  #   still rejected)
  # @return whatever Selma::Rewriter#rewrite returns for +html+
  # @raise [ArgumentError] if +html+ is not a String or +config+ is not a Hash
  def call(html, config = DEFAULT_CONFIG)
    # Validate both arguments up front so callers get a clear message instead
    # of an error from inside the underlying library.
    raise ArgumentError, "html must be a String, not #{html.class}" unless html.is_a?(String)
    raise ArgumentError, "config must be a Hash, not #{config.class}" unless config.is_a?(Hash)

    sanitizer = Selma::Sanitizer.new(config)
    Selma::Rewriter.new(sanitizer: sanitizer).rewrite(html)
  end
end
end
end