lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.4.0 vs lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.5.0

- old
+ new

@@ -8,17 +8,27 @@
     # It takes the Redis URL Sidekiq is connected to
     class SidekiqProber
       QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
       QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze

+      # The maximum depth (from the head) of each queue to probe. Probing the
+      # entirety of a very large queue will take longer and run the risk of
+      # timing out. But when we have a very large queue, we are most in need of
+      # reliable metrics. This trades off completeness for predictability by
+      # only taking a limited amount of items from the head of the queue.
+      PROBE_JOBS_LIMIT = 1_000
+
       POOL_SIZE = 3

       # This timeout is configured to higher interval than scrapping
       # of Prometheus to ensure that connection is kept instead of
       # needed to be re-initialized
       POOL_TIMEOUT = 90

+      PrometheusMetrics.describe("sidekiq_enqueued_jobs",
+                                 "Total number of jobs enqueued by class name. Only inspects the first #{PROBE_JOBS_LIMIT} jobs per queue.") # rubocop:disable Layout/LineLength
+
       def self.connection_pool
         @@connection_pool ||= Hash.new do |h, connection_hash| # rubocop:disable Style/ClassVars
           config = connection_hash.merge(pool_timeout: POOL_TIMEOUT, size: POOL_SIZE)

           h[connection_hash] = Sidekiq::RedisConnection.create(config)
@@ -60,10 +70,17 @@
         end

         self
       end

+      # Count worker classes present in Sidekiq queues. This uses a Lua
+      # script to find all jobs in all queues. That script will block
+      # all other Redis commands:
+      # https://redis.io/commands/eval#atomicity-of-scripts
+      #
+      # The script is generally fast, but may be slower with very large
+      # queues, which is why this is not enabled by default.
       def probe_jobs
         with_sidekiq do
           job_stats = {}

           Sidekiq::Queue.all.each do |queue|
@@ -72,9 +89,41 @@
               job_stats.merge!(stats.to_h)
             end
           rescue Redis::CommandError
             # Could happen if the script exceeded the maximum run time (5 seconds by default)
             # FIXME: Should we call SCRIPT KILL?
             return self
+          end
+
+          job_stats.each do |class_name, count|
+            @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
+          end
+        end
+
+        self
+      end
+
+      # This does the same as #probe_jobs, but only looks at the first
+      # PROBE_JOBS_LIMIT jobs in each queue. This means that we run a
+      # single LRANGE command for each queue, which does not block other
+      # commands. For queues over PROBE_JOBS_LIMIT in size, this means
+      # that we will not have completely accurate statistics, but the
+      # probe performance will also not degrade as the queue gets
+      # larger.
+      #
+      # DO NOT USE this and probe_jobs together, as they export the same
+      # metric (sidekiq_enqueued_jobs).
+      def probe_jobs_limit
+        with_sidekiq do
+          job_stats = Hash.new(0)
+
+          Sidekiq::Queue.all.each do |queue|
+            Sidekiq.redis do |conn|
+              conn.lrange("queue:#{queue.name}", 0, PROBE_JOBS_LIMIT).each do |job|
+                job_class = Sidekiq.load_json(job)["class"]
+
+                job_stats[job_class] += 1
+              end
+            end
           end

           job_stats.each do |class_name, count|
             @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
           end
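
To see the LRANGE-based approach from the new probe_jobs_limit method in isolation, here is a minimal standalone sketch. It is not part of gitlab-exporter: the count_head_of_queue helper, the REDIS_URL default, and the "default" queue name are illustrative assumptions, and it uses the redis and json gems directly instead of the exporter's Sidekiq connection pool.

# count_head_of_queue is a hypothetical helper mirroring what probe_jobs_limit
# does for a single queue: read at most the jobs at indexes 0..limit from the
# head of the list (LRANGE's stop index is inclusive) and tally their "class"
# field, without blocking other Redis clients the way an EVALSHA script would.
require "redis"
require "json"

def count_head_of_queue(redis, queue_name, limit = 1_000)
  counts = Hash.new(0)

  # Sidekiq stores each queue as a Redis list under the key "queue:<name>",
  # with each element being a JSON-encoded job payload.
  redis.lrange("queue:#{queue_name}", 0, limit).each do |raw_job|
    counts[JSON.parse(raw_job)["class"]] += 1
  end

  counts
end

redis = Redis.new(url: ENV.fetch("REDIS_URL", "redis://localhost:6379"))
count_head_of_queue(redis, "default").each do |job_class, count|
  puts "#{job_class}: #{count}"
end

Because Sidekiq job payloads are JSON hashes whose "class" key names the worker, a plain JSON.parse is enough to group jobs by worker class; the trade-off, as the diff's comments note, is that queues longer than the limit are only partially counted.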