tools/riemann-rabbitmq/bin/riemann-rabbitmq in riemann-tools-1.0.0 vs tools/riemann-rabbitmq/bin/riemann-rabbitmq in riemann-tools-1.1.0

- old
+ new

@@ -1,269 +1,273 @@ #!/usr/bin/env ruby -Process.setproctitle($0) +# frozen_string_literal: true +Process.setproctitle($PROGRAM_NAME) + require 'riemann/tools' -class Riemann::Tools::Rabbitmq - include Riemann::Tools +module Riemann + module Tools + class Rabbitmq + include Riemann::Tools - require 'faraday' - require 'json' - require 'uri' + require 'faraday' + require 'json' + require 'uri' + opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 + opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 + opt :monitor_user, 'RabbitMQ monitoring user', type: :string + opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string + opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15_672 + opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: 'localhost' + opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false - opt :monitor_user, 'RabbitMQ monitoring user', type: :string - opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string - opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672 - opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost" - opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false + opt :max_queue_size, 'max number of items in a queue that is acceptable', type: :int, default: 1_000_000 + opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string - opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000 - opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string + opt :node, 'Specify a node to monitor', type: :strings - opt :node, "Specify a node to monitor", type: :strings + def base_url + protocol = 'http' + protocol = 'https' if options[:monitor_use_tls] && (options[:monitor_use_tls] == true) + "#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api" + end - def base_url - protocol = "http" - if (options[:monitor_use_tls]) && (options[:monitor_use_tls]==true) - protocol = "https" - end - "#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api" - end + def overview_url + "#{base_url}/overview" + end - def overview_url - "#{base_url}/overview" - end + def node_url(node) + "#{base_url}/nodes/#{node}" + end - def node_url(n) - "#{base_url}/nodes/#{n}" - end + def queues_url + "#{base_url}/queues" + end - def queues_url - "#{base_url}/queues" - end + def event_host + options[:event_host] || :monitor_host + end - def event_host - if options[:event_host] - return options[:event_host] - else - return options[:monitor_host] - end - end - - def safe_get(uri, event_host) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] + def safe_get(uri, event_host) + # Handle connection timeouts + response = nil + begin + connection = Faraday.new(uri) + response = connection.get do |req| + req.options[:timeout] = options[:read_timeout] + req.options[:open_timeout] = options[:open_timeout] + end + report( + host: event_host, + service: 'rabbitmq monitoring', + state: 'ok', + description: 'Monitoring operational', + ) + rescue StandardError => e + report( + host: event_host, + service: 'rabbitmq monitoring', + state: 'critical', + description: "HTTP connection error: #{e.class} - #{e.message}", + ) end - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => 'ok', - :description => "Monitoring operational" - ) - rescue => e - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) + response end - response - end - def check_queues - response = safe_get(queues_url, event_host) - max_size_check_filter = if options[:ignore_max_size_queues] - Regexp.new(options[:ignore_max_size_queues]) - else - nil - end + def check_queues + response = safe_get(queues_url, event_host) + max_size_check_filter = (Regexp.new(options[:ignore_max_size_queues]) if options[:ignore_max_size_queues]) - return if response.nil? + return if response.nil? - json = JSON.parse(response.body) + if response.status != 200 + report( + host: event_host, + service: 'rabbitmq.queue', + state: 'critical', + description: "HTTP connection error to /api/queues: #{response.status} - #{response.body}", + ) + else + report( + host: event_host, + service: 'rabbitmq.queue', + state: 'ok', + description: 'HTTP connection ok', + ) - if response.status != 200 - report(:host => event_host, - :service => "rabbitmq.queue", - :state => "critical", - :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}" - ) - else - report(:host => event_host, - :service => "rabbitmq.queue", - :state => "ok", - :description => "HTTP connection ok" - ) + json = JSON.parse(response.body) - json = JSON.parse(response.body) + json.each do |queue| + svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}" + errs = [] - json.each do |queue| - svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}" - errs = [] + errs << 'Queue has jobs but no consumers' if !queue['messages_ready'].nil? && (queue['messages_ready']).positive? && (queue['consumers']).zero? - if queue['messages_ready']!=nil and queue['messages_ready'] > 0 and queue['consumers'] == 0 - errs << "Queue has jobs but no consumers" - end + errs << "Queue has #{queue['messages_ready']} jobs" if (max_size_check_filter.nil? || queue['name'] !~ (max_size_check_filter)) && !queue['messages_ready'].nil? && (queue['messages_ready'] > options[:max_queue_size]) - if (max_size_check_filter.nil? or queue['name'] !~ max_size_check_filter) and queue['messages_ready']!=nil and queue['messages_ready'] > options[:max_queue_size] - errs << "Queue has #{queue['messages_ready']} jobs" + if errs.empty? + report( + host: event_host, + service: svc, + state: 'ok', + description: 'Queue is looking good', + ) + else + report( + host: event_host, + service: svc, + state: 'critical', + description: errs.join('; '), + ) + end + + stats = (queue['message_stats'] || {}).merge( + 'messages' => queue['messages'], + 'messages_details' => queue['messages_details'], + 'messages_ready' => queue['messages_ready'], + 'messages_ready_details' => queue['messages_ready_details'], + 'messages_unacknowledged' => queue['messages_unacknowledged'], + 'messages_unacknowledged_details' => queue['messages_unacknowledged_details'], + 'consumers' => queue['consumers'], + 'memory' => queue['memory'], + ) + + stats.each_pair do |k, v| + service = "#{svc}.#{k}" + metric = if k =~ (/details$/) && !v.nil? + v['rate'] + else + v + end + + # TODO: Set state via thresholds which can be configured + + report( + host: event_host, + service: service, + metric: metric, + description: 'RabbitMQ monitor', + ) + end + end end + end - if errs.empty? - report(:host => event_host, - :service => svc, - :state => "ok", - :description => "Queue is looking good" + def check_overview + uri = URI(overview_url) + response = safe_get(uri, event_host) + + return if response.nil? + + json = JSON.parse(response.body) + + if response.status != 200 + report( + host: event_host, + service: 'rabbitmq', + state: 'critical', + description: "HTTP connection error: #{response.status} - #{response.body}", ) else - report(:host => event_host, - :service => svc, - :state => "critical", - :description => errs.join("; ") + report( + host: event_host, + service: 'rabbitmq monitoring', + state: 'ok', + description: 'HTTP connection ok', ) - end - stats = (queue['message_stats'] || {}).merge( - 'messages' => queue['messages'], - 'messages_details' => queue['messages_details'], - 'messages_ready' => queue['messages_ready'], - 'messages_ready_details' => queue['messages_ready_details'], - 'messages_unacknowledged' => queue['messages_unacknowledged'], - 'messages_unacknowledged_details' => queue['messages_unacknowledged_details'], - 'consumers' => queue['consumers'], - 'memory' => queue['memory'], - ) + %w[message_stats queue_totals object_totals].each do |stat| + # NOTE: / BUG ? + # Brand new servers can have blank message stats. Is this ok? + # I can't decide. + next if json[stat].empty? - stats.each_pair do |k,v| - service = "#{svc}.#{k}" - if k =~ /details$/ and v!=nil - metric = v['rate'] - else - metric = v - end + json[stat].each_pair do |k, v| + service = "rabbitmq.#{stat}.#{k}" + metric = if k =~ /details$/ + v['rate'] + else + v + end - # TODO: Set state via thresholds which can be configured + # TODO: Set state via thresholds which can be configured - report(:host => event_host, - :service => service, - :metric => metric, - :description => "RabbitMQ monitor" - ) + report( + host: event_host, + service: service, + metric: metric, + description: 'RabbitMQ monitor', + ) + end + end end end - end - end - def check_overview - uri = URI(overview_url) - response = safe_get(uri, event_host) + def check_node + opts[:node].each do |n| + uri = URI(node_url(n)) + response = safe_get(uri, event_host) - return if response.nil? + break if response.nil? - json = JSON.parse(response.body) + if response.status != 200 + if response.status == 404 + report( + host: event_host, + service: "rabbitmq.node.#{n}", + state: 'critical', + description: 'Node was not found in the cluster', + ) + else + report( + host: event_host, + service: "rabbitmq.node.#{n}", + state: 'critical', + description: "HTTP error: #{response.status} - #{response.body}", + ) + end + break + end - if response.status != 200 - report(:host => event_host, - :service => "rabbitmq", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - else - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => "ok", - :description => "HTTP connection ok" - ) + json = JSON.parse(response.body) - %w( message_stats queue_totals object_totals ).each do |stat| - # NOTE / BUG ? - # Brand new servers can have blank message stats. Is this ok? - # I can't decide. - next if json[stat].empty? - json[stat].each_pair do |k,v| - service = "rabbitmq.#{stat}.#{k}" - if k =~ /details$/ - metric = v['rate'] - else - metric = v + if json['mem_alarm'] + report( + host: event_host, + service: "rabbitmq.node.#{n}", + state: 'critical', + description: 'Memory alarm has triggered; job submission throttled', + ) + break end - # TODO: Set state via thresholds which can be configured + if json['disk_free_alarm'] + report( + host: event_host, + service: "rabbitmq.node.#{n}", + state: 'critical', + description: 'Disk free alarm has triggered; job submission throttled', + ) + break + end - report(:host => event_host, - :service => service, - :metric => metric, - :description => "RabbitMQ monitor" - ) - end - end - end - end - - def check_node - opts[:node].each do |n| - uri = URI(node_url(n)) - response = safe_get(uri, event_host) - - return if response.nil? - - if response.status != 200 - if response.status == 404 - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Node was not found in the cluster" + report( + host: event_host, + service: "rabbitmq.node.#{n}", + state: 'ok', + description: 'Node looks OK to me', ) - else - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "HTTP error: #{response.status} - #{response.body}" - ) end - return end - json = JSON.parse(response.body) - - if json['mem_alarm'] - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Memory alarm has triggered; job submission throttled" - ) - return + def tick + check_overview + check_node if opts[:node] + check_queues end - - if json['disk_free_alarm'] - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Disk free alarm has triggered; job submission throttled" - ) - return - end - - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "ok", - :description => "Node looks OK to me" - ) end end - - def tick - check_overview - check_node if opts[:node] - check_queues - end end Riemann::Tools::Rabbitmq.run -