# frozen_string_literal: true

class HTMLPipeline
  # A special filter with sanitization routines and allowlists. It defines
  # which HTML elements, attributes and URL protocols are permitted in
  # user-provided content; the actual rewriting is delegated to Selma.
  #
  # See the Selma docs for more information on the underlying library:
  #
  # https://github.com/gjtorikian/selma/#readme
  #
  # This filter does not write additional information to the context.
  class SanitizationFilter
    # A duplicate of Selma's own protocol list, exposed under this namespace.
    VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup

    # The default sanitization allowlist, handed to Selma's `freeze_config`.
    #
    # * +elements+   - tag names that are allowed through.
    # * +attributes+ - attribute names allowed per tag; the +:all+ key lists
    #                  attributes allowed on every element.
    # * +protocols+  - URL schemes accepted for specific tag/attribute pairs
    #                  (+:relative+ stands for relative URLs).
    DEFAULT_CONFIG = Selma::Sanitizer::Config.freeze_config({
      elements: %w[
        h1 h2 h3 h4 h5 h6
        br b i strong em a pre code img tt div ins del sup sub p picture
        ol ul table thead tbody tfoot blockquote
        dl dt dd kbd q samp var hr ruby rt rp li tr td th
        s strike summary details caption figure figcaption
        abbr bdo cite dfn mark small source span time wbr
      ],

      attributes: {
        "a" => %w[href],
        "img" => %w[src longdesc loading alt],
        "div" => %w[itemscope itemtype],
        "blockquote" => %w[cite],
        "del" => %w[cite],
        "ins" => %w[cite],
        "q" => %w[cite],
        "source" => %w[srcset],
        all: %w[
          abbr accept accept-charset accesskey action align alt
          aria-describedby aria-hidden aria-label aria-labelledby
          axis border char charoff charset checked clear cols colspan
          compact coords datetime dir disabled enctype for frame headers
          height hreflang hspace id ismap label lang maxlength media method
          multiple name nohref noshade nowrap open progress prompt readonly
          rel rev role rows rowspan rules scope selected shape size span
          start summary tabindex title type usemap valign value width
          itemprop
        ],
      },

      protocols: {
        # Links accept everything Selma itself considers a valid protocol.
        "a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
        # Citation URLs are limited to http(s) and relative references.
        "blockquote" => { "cite" => ["http", "https", :relative].freeze },
        "del" => { "cite" => ["http", "https", :relative].freeze },
        "ins" => { "cite" => ["http", "https", :relative].freeze },
        "q" => { "cite" => ["http", "https", :relative].freeze },
        # Image sources and long descriptions get the same restriction.
        "img" => {
          "src" => ["http", "https", :relative].freeze,
          "longdesc" => ["http", "https", :relative].freeze,
        },
      },
    })

    # Runs +html+ through a Selma rewriter that sanitizes with +config+.
    #
    # html   - the markup to sanitize; must be a String.
    # config - the sanitizer configuration; must be a Hash (for example
    #          DEFAULT_CONFIG or a customised variant of it).
    #
    # Returns the result of Selma::Rewriter#rewrite for +html+.
    # Raises ArgumentError when +html+ is not a String or +config+ is not a
    # Hash.
    def self.call(html, config)
      unless html.is_a?(String)
        raise ArgumentError, "html must be a String, not #{html.class}"
      end

      unless config.is_a?(Hash)
        raise ArgumentError, "config must be a Hash, not #{config.class}"
      end

      # Build the sanitizer first, then hand it to a rewriter that performs
      # no other transformations.
      sanitizer = Selma::Sanitizer.new(config)
      rewriter = Selma::Rewriter.new(sanitizer: sanitizer)
      rewriter.rewrite(html)
    end
  end
end