lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.4.0 vs lib/gitlab_exporter/sidekiq.rb in gitlab-exporter-10.5.0
- old
+ new
@@ -8,17 +8,27 @@
# It takes the Redis URL Sidekiq is connected to
class SidekiqProber
QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze
+ # The maximum depth (from the head) of each queue to probe. Probing the
+ # entirety of a very large queue will take longer and run the risk of
+ # timing out. But when we have a very large queue, we are most in need of
+ # reliable metrics. This trades off completeness for predictability by
+ # only taking a limited number of items from the head of the queue.
+ PROBE_JOBS_LIMIT = 1_000
+
POOL_SIZE = 3
# This timeout is configured to a higher interval than Prometheus
# scraping to ensure that the connection is kept alive instead of
# needing to be re-initialized
POOL_TIMEOUT = 90
+ PrometheusMetrics.describe("sidekiq_enqueued_jobs",
+ "Total number of jobs enqueued by class name. Only inspects the first #{PROBE_JOBS_LIMIT} jobs per queue.") # rubocop:disable Layout/LineLength
+
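As an aside (not part of the gem's code), the description registered above supplies the help text for every labeled sample later recorded with @metrics.add("sidekiq_enqueued_jobs", ...), so a scrape of the exporter would include output roughly like this hypothetical sample:

    # HELP sidekiq_enqueued_jobs Total number of jobs enqueued by class name. Only inspects the first 1000 jobs per queue.
    sidekiq_enqueued_jobs{name="ExampleWorker"} 42
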
def self.connection_pool
@@connection_pool ||= Hash.new do |h, connection_hash| # rubocop:disable Style/ClassVars
config = connection_hash.merge(pool_timeout: POOL_TIMEOUT, size: POOL_SIZE)
h[connection_hash] = Sidekiq::RedisConnection.create(config)
@@ -60,10 +70,17 @@
end
self
end
+ # Count worker classes present in Sidekiq queues. This uses a Lua
+ # script to find all jobs in all queues. That script will block
+ # all other Redis commands:
+ # https://redis.io/commands/eval#atomicity-of-scripts
+ #
+ # The script is generally fast, but may be slower with very large
+ # queues, which is why this is not enabled by default.
def probe_jobs
with_sidekiq do
job_stats = {}
Sidekiq::Queue.all.each do |queue|
@@ -72,9 +89,41 @@
job_stats.merge!(stats.to_h)
end
rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
# FIXME: Should we call SCRIPT KILL?
return self
+ end
+
+ job_stats.each do |class_name, count|
+ @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
+ end
+ end
+
+ self
+ end
+
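For context, here is a minimal sketch (not from the gem; the payload and worker name are invented) of how a worker class name is recovered from a raw queue entry, which is what the limited probe below does for each job it inspects:

    require "sidekiq"

    # A queued job lives in the Redis list "queue:<name>" as a JSON string;
    # its "class" key names the worker, which is what the probes count.
    payload = '{"class":"ExampleWorker","args":[1],"queue":"default","jid":"abc123"}'
    Sidekiq.load_json(payload)["class"] # => "ExampleWorker"
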
+ # This does the same as #probe_jobs, but only looks at the first
+ # PROBE_JOBS_LIMIT jobs in each queue. This means that we run a
+ # single LRANGE command for each queue, which does not block other
+ # commands. For queues over PROBE_JOBS_LIMIT in size, this means
+ # that we will not have completely accurate statistics, but the
+ # probe performance will also not degrade as the queue gets
+ # larger.
+ #
+ # DO NOT USE this and probe_jobs together, as they export the same
+ # metric (sidekiq_enqueued_jobs).
+ def probe_jobs_limit
+ with_sidekiq do
+ job_stats = Hash.new(0)
+
+ Sidekiq::Queue.all.each do |queue|
+ Sidekiq.redis do |conn|
+ conn.lrange("queue:#{queue.name}", 0, PROBE_JOBS_LIMIT).each do |job|
+ job_class = Sidekiq.load_json(job)["class"]
+
+ job_stats[job_class] += 1
+ end
+ end
end
job_stats.each do |class_name, count|
@metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
end
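To make the comparison with probe_jobs concrete, a rough single-queue equivalent of the limited probe might look like the sketch below (illustrative only; the queue name is hypothetical and the 1_000 mirrors PROBE_JOBS_LIMIT). The whole read is a single LRANGE call, which Redis serves without blocking other clients the way a long-running EVAL script can; note that LRANGE's end index is inclusive, so this returns up to 1_001 entries from the head of the queue:

    Sidekiq.redis do |conn|
      # Read only the head of the queue: indexes 0..1_000 inclusive.
      head = conn.lrange("queue:default", 0, 1_000)

      # Tally jobs per worker class, as probe_jobs_limit does across all queues.
      head.each_with_object(Hash.new(0)) do |job, counts|
        counts[Sidekiq.load_json(job)["class"]] += 1
      end
    end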