#!/usr/bin/env ruby
# frozen_string_literal: false

#
# Check Aggregate
# ===
#
# Authors
# ===
# Sean Porter, @portertech
#
# Copyright 2012 Sonian, Inc.
#
# Released under the same terms as Sensu (the MIT license); see
# LICENSE for details.

require 'sensu-plugin/check/cli'
require 'rest-client'
require 'json'

# Sensu check that fetches an aggregate for a named check from the Sensu API
# and compares it against percentage thresholds, absolute node-count
# thresholds, an output pattern, and/or stale-result thresholds.
class CheckAggregate < Sensu::Plugin::Check::CLI
  # Maps a check result status code to the aggregate counter it is tallied
  # under; any other status is counted as :unknown.
  STATUS_COUNTERS = { 0 => :ok, 1 => :warning, 2 => :critical }.freeze

  option :api,
         short: '-a URL',
         long: '--api URL',
         description: 'Sensu API URL',
         default: if ENV['SENSU_API']
                    ENV['SENSU_API'] + ':4567'
                  elsif ENV['SENSU_API_URL']
                    ENV['SENSU_API_URL']
                  else
                    'http://localhost:4567'
                  end

  option :insecure,
         short: '-k',
         boolean: true,
         description: 'Enabling insecure connections',
         default: false

  option :user,
         short: '-u USER',
         long: '--user USER',
         description: 'Sensu API USER'

  option :password,
         short: '-p PASSWORD',
         long: '--password PASSWORD',
         description: 'Sensu API PASSWORD'

  option :timeout,
         short: '-t SECONDS',
         long: '--timeout SECONDS',
         description: 'Sensu API connection timeout in SECONDS',
         proc: proc(&:to_i),
         default: 30

  option :check,
         short: '-c CHECK',
         long: '--check CHECK',
         description: 'Aggregate CHECK name',
         required: true

  option :age,
         short: '-A SECONDS',
         long: '--age SECONDS',
         description: 'Minimum aggregate age in SECONDS, time since check request issued',
         default: 30,
         proc: proc(&:to_i)

  option :limit,
         short: '-l LIMIT',
         long: '--limit LIMIT',
         description: 'Limit of aggregates you want the API to return',
         proc: proc(&:to_i)

  option :summarize,
         short: '-s',
         long: '--summarize',
         boolean: true,
         description: 'Summarize check result output',
         default: false

  option :collect_output,
         short: '-o',
         long: '--output',
         boolean: true,
         description: 'Collects all non-ok outputs',
         default: false

  option :warning,
         short: '-W PERCENT',
         long: '--warning PERCENT',
         description: 'PERCENT warning before warning (can be change with --ignore-severity)',
         proc: proc(&:to_i)

  option :warning_count,
         long: '--warning_count INTEGER',
         description: 'number of nodes in warning before warning (can be change with --ignore-severity)',
         proc: proc(&:to_i)

  option :critical,
         short: '-C PERCENT',
         long: '--critical PERCENT',
         description: 'PERCENT critical before critical (can be change with --ignore-severity)',
         proc: proc(&:to_i)

  option :critical_count,
         long: '--critical_count INTEGER',
         description: 'number of node in critical before critical (can be change with --ignore-severity)',
         proc: proc(&:to_i)

  option :pattern,
         short: '-P PATTERN',
         long: '--pattern PATTERN',
         description: 'A PATTERN to detect outliers'

  option :honor_stash,
         short: '-i',
         long: '--honor-stash',
         description: 'Checks that are stashed will be ignored from the aggregate',
         boolean: true,
         default: false

  option :message,
         short: '-M MESSAGE',
         long: '--message MESSAGE',
         description: 'A custom error MESSAGE'

  option :ignore_severity,
         long: '--ignore-severity',
         description: 'Ignore severities, all non-ok will count for critical, critical_count, warning and warning_count option',
         boolean: true,
         default: false

  option :debug,
         short: '-D',
         long: '--debug',
         description: 'Display results hash at end of output message',
         boolean: true,
         default: false

  option :stale_percentage,
         long: '--stale-percentage PERCENT',
         description: 'PERCENT stale before warning',
         proc: proc(&:to_i)

  option :stale_count,
         long: '--stale-count INTEGER',
         description: 'number of nodes with stale data before warning',
         proc: proc(&:to_i)

  # Performs a GET against the configured API and parses the JSON response.
  #
  # Connection, HTTP and parse failures are reported through the plugin's
  # `warning` helper. RestClient::ResourceNotFound is deliberately not
  # rescued here because #honor_stash relies on it propagating.
  #
  # @param resource [String] path (and query string) appended to config[:api]
  # @return [Hash, Array] parsed JSON with symbolized hash keys
  def api_request(resource)
    verify_mode = config[:insecure] ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
    request = RestClient::Resource.new(
      config[:api] + resource,
      timeout: config[:timeout],
      user: config[:user],
      password: config[:password],
      verify_ssl: verify_mode
    )
    JSON.parse(request.get, symbolize_names: true)
  rescue Errno::ECONNREFUSED
    warning 'Connection refused'
  # Specific rest-client errors are listed before the generic RequestFailed
  # so that a more general clause can never shadow them.
  rescue RestClient::RequestTimeout
    warning 'Connection timed out'
  rescue RestClient::Unauthorized
    warning 'Missing or incorrect Sensu API credentials'
  rescue RestClient::RequestFailed
    warning 'Request failed'
  rescue JSON::ParserError
    warning 'Sensu API returned invalid JSON'
  end

  # Removes results whose client/check pair has a silence stash and adjusts
  # the per-status and total counters to match.
  #
  # A stash lookup that raises RestClient::ResourceNotFound means "not
  # stashed", so that entry is kept.
  #
  # @param aggregate [Hash] aggregate containing :results and the counters
  # @return [Hash] the same aggregate, mutated in place
  def honor_stash(aggregate)
    aggregate[:results].delete_if do |entry|
      begin
        api_request("/stashes/silence/#{entry[:client]}/#{config[:check]}")
        counter = STATUS_COUNTERS.fetch(entry[:status], :unknown)
        aggregate[counter] -= 1
        aggregate[:total] -= 1
        true
      rescue RestClient::ResourceNotFound
        false
      end
    end
    aggregate
  end

  # Gathers the output of every non-ok result into aggregate[:outputs].
  #
  # The outputs are stored as a one-element array holding all outputs, each
  # terminated by a newline.
  #
  # @param aggregate [Hash] aggregate containing :results
  # @return [Hash] the same aggregate, so the caller can keep chaining on it
  def collect_output(aggregate)
    failing = aggregate[:results].reject { |entry| entry[:status].zero? }
    output = failing.map { |entry| "#{entry[:output]}\n" }.join
    aggregate[:outputs] = [output]
    aggregate
  end

  # Picks the aggregate endpoint based on the Sensu version reported by
  # /info: named aggregates when major >= 1 or minor >= 24, otherwise the
  # older issued-timestamp based endpoint.
  #
  # @return [Hash] the aggregate data
  def acquire_aggregate
    # Compare numerically: as strings, minor '9' would sort after '24'.
    major, minor = api_request('/info')[:sensu][:version].split('.').map(&:to_i)
    if major.to_i >= 1 || minor.to_i >= 24
      named_aggregate_results
    else
      aggregate_results
    end
  end

  # Fetches the :results of the named aggregate, limited to config[:age]
  # seconds, and warns when every status counter is zero.
  #
  # @return [Hash] the :results section of the API response
  def named_aggregate_results
    results = api_request("/aggregates/#{config[:check]}?max_age=#{config[:age]}")[:results]
    nothing_reported = %i[ok warning critical unknown].all? { |status| results[status].zero? }
    warning "No aggregates found in last #{config[:age]} seconds" if nothing_reported
    results
  end

  # Fetches the most recent issued aggregate (largest value returned by the
  # listing request) for the configured check.
  #
  # @return [Hash] the aggregate data
  def aggregate_results
    uri = "/aggregates/#{config[:check]}"
    listing = uri + "?age=#{config[:age]}"
    listing += "&limit=#{config[:limit]}" if config[:limit]
    issued = api_request(listing)
    return warning("No aggregates for #{config[:check]}") if issued.empty?

    time = issued.max
    return warning("No aggregates older than #{config[:age]} seconds") if time.nil?

    uri += "/#{time}?"
    uri += '&summarize=output' if config[:summarize]
    uri += '&results=true' if config[:honor_stash] || config[:collect_output]
    api_request(uri)
  end

  # Compares the percentage of failing results against config[:critical] and
  # config[:warning]. With config[:ignore_severity] every non-ok result
  # counts towards both thresholds.
  #
  # @param aggregate [Hash] aggregate with :ok, :warning, :critical, :total
  # @return [void]
  def compare_thresholds(aggregate)
    default = 'Number of non-zero results exceeds threshold'
    total = aggregate[:total]
    if config[:ignore_severity]
      percent_critical = percent_warning = percentage(total.to_i - aggregate[:ok].to_i, total)
      critical_label = warning_label = 'non-zero'
    else
      percent_critical = percentage(aggregate[:critical], total)
      percent_warning = percentage(aggregate[:warning], total)
      critical_label = 'critical'
      warning_label = 'warning'
    end

    if config[:critical] && percent_critical >= config[:critical]
      critical build_message(default, format('(%d%% %s)', percent_critical, critical_label), aggregate)
    elsif config[:warning] && percent_warning >= config[:warning]
      warning build_message(default, format('(%d%% %s)', percent_warning, warning_label), aggregate)
    end
  end

  # Goes critical when two outputs share the first capture group of
  # config[:pattern] but differ in the remaining capture groups.
  #
  # @param aggregate [Hash] aggregate with :outputs
  # @return [void]
  def compare_pattern(aggregate)
    regex = Regexp.new(config[:pattern])
    message = config[:message] || 'One of these is not like the others!'
    message += "\n#{aggregate}" if config[:debug]
    seen = {}
    output_strings(aggregate).each do |output|
      matched = regex.match(output)
      next if matched.nil?

      key = matched[1]
      value = matched[2..-1]
      critical message + " (#{key})" if seen.key?(key) && seen[key] != value
      seen[key] = value
    end
  end

  # Compares the number of failing nodes against config[:critical_count]
  # and config[:warning_count]. With config[:ignore_severity] every non-ok
  # node counts towards both thresholds.
  #
  # @param aggregate [Hash] aggregate with :ok, :warning, :critical, :total
  # @return [void]
  def compare_thresholds_count(aggregate)
    default = 'Number of nodes down exceeds threshold'
    if config[:ignore_severity]
      critical_nodes = warning_nodes = aggregate[:total].to_i - aggregate[:ok].to_i
      critical_label = warning_label = 'not ok'
    else
      critical_nodes = aggregate[:critical].to_i
      warning_nodes = aggregate[:warning].to_i
      critical_label = 'critical'
      warning_label = 'warning'
    end

    if config[:critical_count] && critical_nodes >= config[:critical_count]
      critical build_message(default, node_summary(critical_nodes, aggregate, critical_label), aggregate)
    elsif config[:warning_count] && warning_nodes >= config[:warning_count]
      warning build_message(default, node_summary(warning_nodes, aggregate, warning_label), aggregate)
    end
  end

  # Warns when stale results reach config[:stale_percentage] or, if that is
  # not set, config[:stale_count].
  #
  # @param aggregate [Hash] aggregate with :stale and :total
  # @return [void]
  def compare_stale(aggregate)
    default = 'Number of stale results exceeds threshold'
    if config[:stale_percentage]
      percent_stale = percentage(aggregate[:stale], aggregate[:total])
      return unless percent_stale >= config[:stale_percentage]

      warning build_message(default, node_summary("#{percent_stale}%", aggregate, 'stale'), aggregate, debug: false)
    elsif config[:stale_count]
      stale = aggregate[:stale].to_i
      return unless stale >= config[:stale_count]

      warning build_message(default, node_summary(stale, aggregate, 'stale'), aggregate, debug: false)
    end
  end

  # Entry point: validates the configuration, acquires the aggregate,
  # applies the optional stash/output processing, runs every configured
  # comparison and finally reports ok.
  #
  # @return [void]
  def run
    threshold = config[:critical] || config[:warning]
    threshold_count = config[:critical_count] || config[:warning_count]
    pattern = config[:summarize] && config[:pattern]
    unless threshold || pattern || threshold_count
      critical 'Misconfiguration: critical || warning || (summarize && pattern) must be set'
    end

    aggregate = acquire_aggregate
    aggregate = honor_stash(aggregate) if config[:honor_stash]
    aggregate = collect_output(aggregate) if config[:collect_output]
    compare_thresholds(aggregate) if threshold
    compare_pattern(aggregate) if pattern
    compare_thresholds_count(aggregate) if threshold_count
    compare_stale(aggregate) if config[:stale_percentage] || config[:stale_count]

    if config[:debug]
      ok "Aggregate looks GOOD\n" + aggregate.to_s
    else
      ok 'Aggregate looks Good'
    end
  end

  private

  # Whole-number percentage of count in total, rounded down.
  # Integer arithmetic avoids float truncation (57.0 / 100.0 * 100 is
  # 56.99...), and a zero total yields 0 instead of raising on NaN.
  def percentage(count, total)
    total = total.to_i
    return 0 if total.zero?

    count.to_i * 100 / total
  end

  # Normalizes aggregate[:outputs] to an array of strings, whether it is
  # absent, an array of outputs, or a hash keyed by output.
  def output_strings(aggregate)
    outputs = aggregate[:outputs]
    case outputs
    when nil then []
    when Hash then outputs.keys.map(&:to_s)
    else Array(outputs).map(&:to_s)
    end
  end

  # Renders the "(N out of TOTAL nodes reporting LABEL)" summary.
  def node_summary(value, aggregate, label)
    "(#{value} out of #{aggregate[:total]} nodes reporting #{label})"
  end

  # Assembles the final status message: custom or default text, the already
  # rendered summary, any outputs and, optionally, the debug dump. Nothing
  # here is passed through `format`, so '%' in outputs or in the custom
  # message is emitted verbatim.
  def build_message(default, summary, aggregate, debug: config[:debug])
    message = "#{config[:message] || default} #{summary}"
    outputs = output_strings(aggregate).join("\n").chomp
    message += "\n#{outputs}" unless outputs.empty?
    message += "\n#{aggregate}" if debug
    message
  end
end