lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.4.0 vs lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.5.0
- old
+ new
@@ -8,17 +8,27 @@
# It takes the Redis URL Sidekiq is connected to
class SidekiqProber
QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze
+ # The maximum depth (from the head) of each queue to probe. Probing the
+ # entirety of a very large queue will take longer and run the risk of
+ # timing out. But when we have a very large queue, we are most in need of
+ # reliable metrics. This trades off completeness for predictability by
+ # only taking a limited number of items from the head of the queue.
+ PROBE_JOBS_LIMIT = 1_000
+
POOL_SIZE = 3
# This timeout is configured to a higher interval than Prometheus
# scraping to ensure that the connection is kept alive instead of
# needing to be re-initialized
POOL_TIMEOUT = 90
+ PrometheusMetrics.describe("sidekiq_enqueued_jobs",
+ "Total number of jobs enqueued by class name. Only inspects the first #{PROBE_JOBS_LIMIT} jobs per queue.") # rubocop:disable Layout/LineLength
+
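As an aside (not part of the gem's code), the description registered above supplies the help text for every labeled sample later recorded with @metrics.add("sidekiq_enqueued_jobs", ...), so a scrape of the exporter would include output roughly like this hypothetical sample:

    # HELP sidekiq_enqueued_jobs Total number of jobs enqueued by class name. Only inspects the first 1000 jobs per queue.
    sidekiq_enqueued_jobs{name="ExampleWorker"} 42
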
def self.connection_pool
@@connection_pool ||= Hash.new do |h, connection_hash| # rubocop:disable Style/ClassVars
config = connection_hash.merge(pool_timeout: POOL_TIMEOUT, size: POOL_SIZE)
h[connection_hash] = Sidekiq::RedisConnection.create(config)
@@ -60,10 +70,17 @@
end
self
end
+ # Count worker classes present in Sidekiq queues. This uses a Lua
+ # script to find all jobs in all queues. That script will block
+ # all other Redis commands:
+ # https://redis.io/commands/eval#atomicity-of-scripts
+ #
+ # The script is generally fast, but may be slower with very large
+ # queues, which is why this is not enabled by default.
def probe_jobs
with_sidekiq do
job_stats = {}
Sidekiq::Queue.all.each do |queue|
@@ -72,9 +89,41 @@
job_stats.merge!(stats.to_h)
end
rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
# FIXME: Should we call SCRIPT KILL?
return self
+ end
+
+ job_stats.each do |class_name, count|
+ @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
+ end
+ end
+
+ self
+ end
+
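For context, here is a minimal sketch (not from the gem; the payload and worker name are invented) of how a worker class name is recovered from a raw queue entry, which is what the limited probe below does for each job it inspects:

    require "sidekiq"

    # A queued job lives in the Redis list "queue:<name>" as a JSON string;
    # its "class" key names the worker, which is what the probes count.
    payload = '{"class":"ExampleWorker","args":[1],"queue":"default","jid":"abc123"}'
    Sidekiq.load_json(payload)["class"] # => "ExampleWorker"
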
+ # This does the same as #probe_jobs, but only looks at the first
+ # PROBE_JOBS_LIMIT jobs in each queue. This means that we run a
+ # single LRANGE command for each queue, which does not block other
+ # commands. For queues over PROBE_JOBS_LIMIT in size, this means
+ # that we will not have completely accurate statistics, but the
+ # probe performance will also not degrade as the queue gets
+ # larger.
+ #
+ # DO NOT USE this and probe_jobs together, as they export the same
+ # metric (sidekiq_enqueued_jobs).
+ def probe_jobs_limit
+ with_sidekiq do
+ job_stats = Hash.new(0)
+
+ Sidekiq::Queue.all.each do |queue|
+ Sidekiq.redis do |conn|
+ conn.lrange("queue:#{queue.name}", 0, PROBE_JOBS_LIMIT).each do |job|
+ job_class = Sidekiq.load_json(job)["class"]
+
+ job_stats[job_class] += 1
+ end
+ end
end
job_stats.each do |class_name, count|
@metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
end
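To make the comparison with probe_jobs concrete, a rough single-queue equivalent of the limited probe might look like the sketch below (illustrative only; the queue name is hypothetical and the 1_000 mirrors PROBE_JOBS_LIMIT). The whole read is a single LRANGE call, which Redis serves without blocking other clients the way a long-running EVAL script can; note that LRANGE's end index is inclusive, so this returns up to 1_001 entries from the head of the queue:

    Sidekiq.redis do |conn|
      # Read only the head of the queue: indexes 0..1_000 inclusive.
      head = conn.lrange("queue:default", 0, 1_000)

      # Tally jobs per worker class, as probe_jobs_limit does across all queues.
      head.each_with_object(Hash.new(0)) do |job, counts|
        counts[Sidekiq.load_json(job)["class"]] += 1
      end
    end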