# frozen_string_literal: true

module Karafka
  module Web
    module Tracking
      module Consumers
        module Listeners
          # Listener used to collect metrics published by librdkafka
          class Statistics < Base
            # Keys copied from the `cgrp` statistics section into the subscription group state
            SG_STATE_KEYS = %w[
              state
              join_state
              stateage
              rebalance_age
              rebalance_cnt
              rebalance_reason
            ].freeze

            # Keys copied from the per-partition statistics into the reported metrics
            PARTITION_METRICS_KEYS = [
              'consumer_lag',
              'consumer_lag_d',
              'consumer_lag_stored',
              'consumer_lag_stored_d',
              'committed_offset',
              # Can be useful to track the frequency of flushes when there is progress
              'committed_offset_fd',
              'stored_offset',
              # Can be useful to track the frequency of flushes when there is progress
              'stored_offset_fd',
              'fetch_state',
              'hi_offset',
              'hi_offset_fd',
              'lo_offset',
              'eof_offset',
              'ls_offset',
              # Two below can be useful for detection of hanging transactions
              'ls_offset_d',
              'ls_offset_fd'
            ].freeze

            # Fetch states of partitions that we do not report on:
            #   - `stopped` means we stopped working with the partition
            #   - `none` means we no longer have this subscription assigned and we do not fetch
            NON_REPORTABLE_FETCH_STATES = %w[stopped none].freeze

            private_constant :SG_STATE_KEYS, :PARTITION_METRICS_KEYS, :NON_REPORTABLE_FETCH_STATES

            # Collect Kafka metrics
            #
            # Builds the subscription group details (state + per topic partitions metrics) out
            # of the emitted statistics and stores them in the sampler under the consumer group
            # and subscription group ids taken from the event.
            #
            # @param event [Karafka::Core::Monitoring::Event]
            def on_statistics_emitted(event)
              statistics = event[:statistics]
              topics = statistics.fetch('topics')
              cgrp = statistics.fetch('cgrp')
              cg_id = event[:consumer_group_id]
              sg_id = event[:subscription_group_id]
              sg_details = extract_sg_details(sg_id, cgrp)
              topics_details = sg_details[:topics]

              # More than one subscription group from the same consumer group may be reporting
              # at almost the same time. To prevent corruption of partial data, we put
              # everything here in track as we merge data from multiple subscription groups
              track do |sampler|
                topics.each do |topic_name, topic_values|
                  topic_values.fetch('partitions').each do |pt_name, pt_stats|
                    pt_id = pt_name.to_i

                    next unless partition_reportable?(pt_id, pt_stats)

                    metrics = extract_partition_metrics(pt_stats)

                    next if metrics.empty?

                    # Topic entry is created lazily, so topics without any reportable
                    # partitions are not present in the report
                    topic_details = topics_details[topic_name] ||= {
                      name: topic_name,
                      partitions: {}
                    }

                    topic_details[:partitions][pt_id] = metrics.merge(
                      id: pt_id,
                      # Pauses are stored on a consumer group since we do not process same topic
                      # twice in the multiple subscription groups
                      poll_state: poll_state(cg_id, topic_name, pt_id)
                    )
                  end
                end

                sampler.consumer_groups[cg_id] ||= {
                  id: cg_id,
                  subscription_groups: {}
                }

                sampler.consumer_groups[cg_id][:subscription_groups][sg_id] = sg_details
              end
            end

            private

            # Extracts basic subscription group related details
            #
            # @param sg_id [String] subscription group id
            # @param sg_stats [Hash] the `cgrp` section of the emitted statistics
            # @return [Hash] subscription group details: its id, the selected state values and
            #   an empty topics hash to be filled with partitions metrics
            def extract_sg_details(sg_id, sg_stats)
              {
                id: sg_id,
                state: sg_stats.slice(*SG_STATE_KEYS),
                topics: {}
              }
            end

            # @param pt_id [Integer]
            # @param pt_stats [Hash]
            # @return [Boolean] is this partition relevant to the current process, hence should we
            #   report about it in the context of the process.
            def partition_reportable?(pt_id, pt_stats)
              # NOTE(review): `-1` is presumably the librdkafka internal (unassigned) partition
              # entry and not a real partition - confirm against the librdkafka statistics docs
              return false if pt_id == -1

              # Collect information only about what we are subscribed to and what we fetch or
              # work in any way
              !NON_REPORTABLE_FETCH_STATES.include?(pt_stats['fetch_state'])
            end

            # Extracts and formats partition relevant metrics
            #
            # @param pt_stats [Hash]
            # @return [Hash] extracted partition metrics with symbol keys. Keys that are not
            #   present in the stats are skipped
            def extract_partition_metrics(pt_stats)
              pt_stats
                .slice(*PARTITION_METRICS_KEYS)
                # Rename as we do not need `consumer_` prefix and symbolize in the same pass
                .transform_keys { |key| key.delete_prefix('consumer_').to_sym }
            end

            # @param cg_id [String]
            # @param topic_name [String]
            # @param pt_id [Integer]
            # @return [String] poll state / is partition paused or not
            def poll_state(cg_id, topic_name, pt_id)
              pause_id = [cg_id, topic_name, pt_id].join('-')

              sampler.pauses.include?(pause_id) ? 'paused' : 'active'
            end
          end
        end
      end
    end
  end
end