# frozen_string_literal: true require "zeitwerk" lib_dir = File.join(File.dirname(__dir__), "lib") gem_loader = Zeitwerk::Loader.for_gem gem_loader.inflector.inflect( "html_pipeline" => "HTMLPipeline", ) gem_loader.ignore(File.join(lib_dir, "html-pipeline.rb")) gem_loader.setup if ENV.fetch("DEBUG", false) require "amazing_print" require "debug" end class HTMLPipeline # HTML processing filters and utilities. This module includes a small # framework for defining DOM based content filters and applying them to user # provided content. # # See HTMLPipeline::Filter for information on building filters. # # Construct a Pipeline for running multiple HTML filters. A pipeline is created once # with one to many filters, and it then can be `call`ed many times over the course # of its lifetime with input. # # filters - Array of Filter objects. Each must respond to call(doc, # context). Filters are performed in the order provided. # default_context - The default context hash. Values specified here will be merged # into values from the each individual pipeline run. Can NOT be # nil. Default: empty Hash. # result_class - The default Class of the result object for individual # calls. Default: Hash. Protip: Pass in a Struct to get # some semblance of type safety. class MissingDependencyError < RuntimeError; end class InvalidFilterError < ArgumentError; end class << self def optional_dependency(name, requirer) require name rescue LoadError # rubocop:disable Lint/SuppressedException: end def require_dependency(name, requirer) require name rescue LoadError => e raise MissingDependencyError, "Missing dependency '#{name}' for #{requirer}. See README.md for details.\n#{e.class.name}: #{e}" end def require_dependencies(names, requirer) dependency_list = names.dup loaded = false while !loaded && names.length > 1 name = names.shift begin require_dependency(name, requirer) loaded = true # we got a dependency define_dependency_loaded_method(name, true) # try the next dependency rescue MissingDependencyError define_dependency_loaded_method(name, false) end end return if loaded begin name = names.shift require name define_dependency_loaded_method(name, true) rescue LoadError => e raise MissingDependencyError, "Missing all dependencies '#{dependency_list.join(", ")}' for #{requirer}. See README.md for details.\n#{e.class.name}: #{e}" end end def define_dependency_loaded_method(name, value) self.class.define_method(:"#{name}_loaded?", -> { value }) end end # Public: Returns an Array of Filter objects for this Pipeline. attr_reader :text_filters, :node_filters # Public: A hash representing the sanitization configuration settings attr_reader :sanitization_config # Public: Instrumentation service for the pipeline. # Set an ActiveSupport::Notifications compatible object to enable. attr_accessor :instrumentation_service # Public: String name for this Pipeline. Defaults to Class name. attr_writer :instrumentation_name def instrumentation_name return @instrumentation_name if defined?(@instrumentation_name) @instrumentation_name = self.class.name end class << self # Public: Default instrumentation service for new pipeline objects. attr_accessor :default_instrumentation_service end def initialize(text_filters: [], convert_filter: nil, sanitization_config: SanitizationFilter::DEFAULT_CONFIG, node_filters: [], default_context: {}, result_class: Hash) raise ArgumentError, "default_context cannot be nil" if default_context.nil? @text_filters = text_filters.flatten.freeze validate_filters(@text_filters, HTMLPipeline::TextFilter) @node_filters = node_filters.flatten.freeze validate_filters(@node_filters, HTMLPipeline::NodeFilter) @convert_filter = convert_filter if @convert_filter.nil? && !@node_filters.empty? raise InvalidFilterError, "Must provide `convert_filter` if `node_filter`s is also provided" elsif !@convert_filter.nil? validate_filter(@convert_filter, HTMLPipeline::ConvertFilter) end @sanitization_config = sanitization_config.nil? ? nil : Selma::Sanitizer.new(sanitization_config) @default_context = default_context.freeze @instrumentation_service = self.class.default_instrumentation_service end # Apply all filters in the pipeline to the given HTML. # # html - A UTF-8 String comprised of HTML. # context - The context hash passed to each filter. See the Filter docs # for more info on possible values. This object MUST NOT be modified # in place by filters. Use the Result for passing state back. # result - The result Hash passed to each filter for modification. This # is where Filters store extracted information from the content. # # Returns the result Hash after being filtered by this Pipeline. Contains an # :output key with the String HTML markup based on the # output of the last filter in the pipeline. def call(text, context: {}, result: {}) context = @default_context.merge(context) context = context.freeze result ||= {} payload = default_payload({ text_filters: @text_filters.map(&:name), context: context, result: result, }) instrument("call_text_filters.html_pipeline", payload) do result[:output] = @text_filters.inject(text) do |doc, filter| perform_filter(filter, doc, context: context, result: result) end end text = result[:output] html = @convert_filter.call(text) unless @convert_filter.nil? unless @node_filters.empty? payload = default_payload({ node_filters: @node_filters.map { |f| f.class.name }, context: context, result: result, }) instrument("call_node_filters.html_pipeline", payload) do result[:output] = Selma::Rewriter.new(sanitizer: @sanitization_config, handlers: @node_filters).rewrite(html) end end result = result.merge(Hash[*@node_filters.collect(&:result).flatten]) @node_filters.each(&:reset!) result end # Internal: Applies a specific filter to the supplied doc. # # The filter is instrumented. # # Returns the result of the filter. def perform_filter(filter, doc, context: {}, result: {}) payload = default_payload({ filter: filter.name, context: context, result: result, }) instrument("call_filter.html_pipeline", payload) do filter.call(doc, context: context, result: result) end end # Like call but guarantee the value returned is a string of HTML markup. def to_html(input, context: {}, result: {}) result = call(input, context: context, result: result) output = result[:output] if output.respond_to?(:to_html) output.to_html else output.to_s end end # Public: setup instrumentation for this pipeline. # # Returns nothing. def setup_instrumentation(name, service: nil) self.instrumentation_name = name self.instrumentation_service = service || self.class.default_instrumentation_service end # Internal: if the `instrumentation_service` object is set, instruments the # block, otherwise the block is ran without instrumentation. # # Returns the result of the provided block. def instrument(event, payload = {}, &block) payload ||= default_payload return yield(payload) unless instrumentation_service instrumentation_service.instrument(event, payload, &block) end # Internal: Default payload for instrumentation. # # Accepts a Hash of additional payload data to be merged. # # Returns a Hash. def default_payload(payload = {}) { pipeline: instrumentation_name }.merge(payload) end private def validate_filter(filter, klass) unless correctly_ancestored?(filter, klass) raise InvalidFilterError, "Filter must inherit from `#{klass}`; #{filter} does not" end end private def validate_filters(filters, klass) return if filters.nil? || filters.empty? invalid_filters = filters.reject { |f| correctly_ancestored?(f, klass) } unless invalid_filters.empty? verb = invalid_filters.count == 1 ? "does" : "do" raise InvalidFilterError, "All filters must inherit from `#{klass}`; #{invalid_filters.join(", ")} #{verb} not" end end private def correctly_ancestored?(filter, klass) if filter.respond_to?(:ancestors) filter.ancestors.include?(klass) else filter.class.ancestors.include?(klass) end end end