require 'prometheus/client'

require 'frankenstein/error'

module Frankenstein
  # A common pattern for statistical instrumentation is to capture a few basic
  # numbers for all incoming and outgoing requests to the service.  Since this
  # is a common pattern, we can abstract that behaviour into a common class,
  # which simplifies the external interface for maintaining statistics in this
  # common case.
  #
  # For more information on this pattern, see
  # https://honeycomb.io/blog/2017/01/instrumentation-the-first-four-things-you-measure/
  #
  class Request
    # Raised by #measure when it is called without a block to instrument.
    class NoBlockError < Frankenstein::Error
    end

    # Create a new request instrumentation package.
    #
    # A "request", for the purposes of this discussion, is a distinct
    # interaction with an external system, typically either the receipt of some
    # sort of communication from another system which needs a response by this
    # system (the one being instrumented), or else the communication to another
    # system from this one for which we are expecting an answer.  Each instance
    # of this class should be used to instrument all requests of a particular
    # type.
    #
    # For each instance of this class, the following metrics will be created:
    #
    # * `<prefix>_requests_total` -- a counter indicating the total number
    #   of requests started (initiated or received by the system).  Labels on
    #   this metric are taken from the label set passed to #measure.
    #
    # * `<prefix>_request_duration_seconds` -- a histogram for the response
    #   times of successful responses (that is, where no exception was raised).
    #   You can get the count of total successful responses from
    #   `<prefix>_request_duration_seconds_count`.  Labels on this metric
    #   are taken from the labels set generated during the measured run (as
    #   generated by manipulating the hash yielded to your block).
    #
    # * `<prefix>_exceptions_total` -- a count of the number of exceptions
    #   raised during processing.  Labels on this metric are taken from the
    #   label set passed to #measure, along with a special label, `class`,
    #   which indicates the class of the exception raised.
    #
    # * `<prefix>_in_progress_count` -- a gauge indicating how many requests
    #   are currently in progress as at the time of the scrape.  Labels on this
    #   metric are taken from the label set passed to #measure.
    #
    # @param prefix [#to_s] the string that will be prepended to all of the
    #   Prometheus metric names generated for this instrumentation.  The prefix
    #   you choose should include both the application name (typically the
    #   first word) as well as a unique identifier for the request type itself.
    #   Multiple words should be underscore separated.
    #
    # @param outgoing [Boolean] whether this Request instance is collecting
    #   data on incoming requests or outgoing requests (the default, as usually
    #   there is one incoming request handler, but there can easily be several
    #   outgoing request types).  It is only used to customise the metric
    #   description text for the metrics, so it's not crucially important.
    #
    # @param description [#to_s] a short explanation of what this is measuring.
    #   It should be a singular and indefinite noun phrase, to maximise the
    #   chances that it will fit neatly into the generated description text.
    #
    # @param labels [Array<Symbol>] the names of the labels that will be
    #   set on most of the metrics created by this instance.
    #
    # @param duration_labels [Array<Symbol>] the names of the labels which will
    #   be set on the `_duration_seconds` histogram.  By default, the value
    #   of the `labels` parameter will be used, but if you want to have a separate
    #   label set for your duration histogram, this is what you want.
    #
    # @param registry [Prometheus::Client::Registry] the client registry in
    #   which all the metrics will be created.  The default will put all the
    #   metrics in the Prometheus Client's default registry, which may or may
    #   not be what you're up for.  If you're using Frankenstein::Server, you
    #   want `stats_server.registry`.
    #
    def initialize(
      prefix,
      labels: [],
      duration_labels: nil,
      outgoing: true,
      description: prefix,
      registry: Prometheus::Client.registry
    )
      # The direction only affects the wording of the metric docstrings: an
      # outgoing request is "sent" and its response is something we
      # "receive"; an incoming request is "received" and we "send" the
      # response.
      request_verb  = outgoing ? 'sent' : 'received'
      response_verb = outgoing ? 'receive' : 'send'

      # Total requests started.
      @requests = registry.counter(
        :"#{prefix}_requests_total",
        docstring: "Number of #{description} requests #{request_verb}",
        labels: labels
      )

      # Response times; uses its own label names if any were given,
      # otherwise falls back to the common label names.
      @durations = registry.histogram(
        :"#{prefix}_request_duration_seconds",
        docstring: "Time taken to #{response_verb} a #{description} response",
        labels: duration_labels || labels
      )

      # Exceptions raised while handling a request; carries an extra
      # `class` label on top of the common label names.
      @exceptions = registry.counter(
        :"#{prefix}_exceptions_total",
        docstring: "Number of exceptions raised by the #{description} code",
        labels: labels + [:class]
      )

      # Requests currently being processed.
      @current = registry.gauge(
        :"#{prefix}_in_progress_count",
        docstring: "Number of #{description} requests currently in progress",
        labels: labels
      )

      # The in-progress gauge is adjusted with a get-then-set pair of calls,
      # which is a classic race unless the pair is serialised; this mutex is
      # what #measure holds around each such adjustment.
      @mutex = Mutex.new
    end

    # Instrument an instance of the request.
    #
    # Each time a particular external communication occurs, it should be
    # wrapped by a call to this method.  Request-related statistics (that
    # the request has been made or received) are updated before the passed
    # block is executed, and then after the block completes,
    # response-related statistics (duration or exception) are recorded.  The
    # number of currently-in-progress instances of the request are also kept
    # track of.
    #
    # @param labels [Hash] a set of labels that can help to differentiate
    #   different sorts of requests.  These labels are applied to the
    #   `<prefix>_requests_total` and `<prefix>_in_progress_count` metrics, as
    #   well as the `<prefix>_exceptions_total` metric, if an exception is
    #   raised.
    #
    #   Don't get too fancy with this label set -- it's unusual that this is
    #   actually useful in practice.  However it is provided for those unusual
    #   cases where it isn't a bad idea.  Your go-to solution should be to
    #   label the `<prefix>_request_duration_seconds` metric (by modifying the
    #   hash yielded to the block you pass to #measure), rather than using
    #   this parameter with wild abandon.
    #
    #   Serious talk time: I've been there.  It seems like a great idea at
    #   first, to differentiate requests with lots of labels, but it usually
    #   just ends up turning into a giant mess.  Primarily, due to the way that
    #   Prometheus deals with label sets, if you *ever* use a label on *any* of
    #   your requests, you need to set the same label to some value on *all* of
    #   your requests.  So, unless you can say with certainty that every
    #   request you receive will logically have some meaningful value for a
    #   given label, you shouldn't use it.
    #
    #   **NOTE**: the labelset you specify here will be the default labelset
    #   applied to the `<prefix>_request_duration_seconds` metric.  If you need
    #   to remove a label from the response, use `labels.replace` or
    #   `labels.delete` to remove the key.
    #
    # @yield [Hash] the labels that will be applied to the
    #   `<prefix>_request_duration_seconds` metric.
    #
    #   In order for your label set to be applied, you must *mutate the
    #   hash that is yielded*, rather than overwriting it.  That means,
    #   for example, that the following code **will not work**:
    #
    #       req_stats.measure do |labels|
    #         labels = {foo: 'bar', baz: 'wombat'}
    #         ...
    #
    #   Instead, you need to either set each key one by one, or use the
    #   handy-dandy Hash#replace method, like this:
    #
    #       req_stats.measure do |labels|
    #         labels.replace(foo: 'bar', baz: 'wombat')
    #         ...
    #
    #   If your labels are not being applied to your response histogram,
    #   check for any assignment to the yielded variable.  It's *really* easy
    #   to do by mistake.
    #
    #   **NOTE WELL**: The Prometheus specification (assuming it exists)
    #   apparently requires that all of the instances of a given metric
    #   have the same set of labels.  If you fail to do this, an exception
    #   will be raised by Prometheus after the block is executed.
    #
    # @raise [Frankenstein::Request::NoBlockError] if you didn't pass a block to
    #   #measure.  There's nothing to instrument!
    #
    # @raise [Prometheus::Client::LabelSetValidator::LabelSetError] if you
    #   violate any written or unwritten rules about how Prometheus label
    #   sets should be constructed.
    #
    # @raise [Exception] any exception raised by the executed block will
    #   be re-raised by this method after statistics collection is
    #   complete.
    #
    # @return [Object] whatever was returned by the block passed.
    #
    def measure(labels = {})
      # Nothing to instrument without a block; bail out before touching any
      # metric.
      unless block_given?
        raise NoBlockError,
          "No block passed to #{self.class}#measure"
      end

      # Work from a private snapshot of the caller's hash.  If the caller
      # mutates their own hash while the block runs, the exception counter
      # and the gauge decrement below must still use the very same label set
      # that the request counter and gauge increment used, otherwise the
      # in-progress gauge never returns to its previous value.
      labels = labels.dup

      # Durations are taken from the monotonic clock.  Wall-clock time
      # (Time.now) can be stepped backwards or forwards while the block is
      # running, which would produce negative or wildly wrong observations.
      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      @requests.increment(labels: labels)
      # get-then-set is not atomic on its own, hence the mutex.
      @mutex.synchronize { @current.set((@current.get(labels: labels) || 0) + 1, labels: labels) }

      # The block receives its own copy, which it may mutate to control the
      # labels applied to the duration histogram.
      res_labels = labels.dup

      begin
        result = yield(res_labels)

        # Only reached when the block did not raise, so the histogram
        # records successful requests only.
        elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
        @durations.observe(elapsed_time, labels: res_labels)

        result
      rescue Exception => ex
        # Deliberately broad: every failure is counted, whatever its class,
        # and it is always re-raised untouched.
        @exceptions.increment(labels: labels.merge(class: ex.class.to_s))
        raise
      ensure
        # Runs on both the success and failure paths, so the in-progress
        # gauge always goes back down.
        @mutex.synchronize { @current.set((@current.get(labels: labels) || 0) - 1, labels: labels) }
      end
    end
  end
end