bin/check-marathon-task.rb in sensu-plugins-mesos-0.1.1 vs bin/check-marathon-task.rb in sensu-plugins-mesos-1.0.0

- old
+ new

require 'sensu-plugin/check/cli'
require 'net/http'
require 'json'

# This plugin checks that the given Mesos/Marathon task is running properly.
#
# This means that all of the following is true:
# 1. There are N tasks for the app, as defined by the --instances parameter
# 2. Each task's state is running
# 3. No task is unhealthy, as defined in Marathon
#
# A task is seen as **unhealthy** by Marathon if any of the health checks for
# the task is not **alive**. Alive means that a check has a last success that
# is more recent than last failure. It's not alive if the last failure is more
# recent than the last success, or if the last success doesn't exist at all.
class MarathonTaskCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMarathonTask'

  # Comma-separated list of Marathon hosts; they are queried in the given order.
  option :server,
         short: '-s SERVER',
         long: '--server SERVER',
         required: true

  option :port,
         short: '-p PORT',
         long: '--port PORT',
         default: 8080

  # Long switch only: '-u' belongs to --username below. Declaring '-u' on both
  # options meant one of them could not be reached through its short form.
  # NOTE(review): which of the two used to win depends on the option parser's
  # registration order -- confirm before relying on '-u' in existing configs.
  option :uri,
         description: 'Endpoint URI',
         long: '--uri URI',
         default: '/v2/tasks?status=running'

  # Marathon application id, given without the leading slash.
  option :task,
         short: '-t TASK',
         long: '--task TASK',
         required: true

  # Expected number of instances. Non-numeric input becomes 0 through to_i and
  # is rejected in #run.
  option :instances,
         short: '-i INSTANCES',
         long: '--instances INSTANCES',
         required: true,
         proc: proc(&:to_i)

  # URL scheme; 'https' additionally switches the connection to TLS.
  option :protocol,
         short: '-P PROTOCOL',
         long: '--protocol PROTOCOL',
         required: false,
         default: 'http'

  # Basic-auth credentials; both or neither must be supplied.
  option :username,
         short: '-u USERNAME',
         long: '--username USERNAME',
         required: false

  option :password,
         long: '--password PASSWORD',
         required: false

  # Entry point of the check.
  #
  # Validates the configuration, then asks each configured server in turn for
  # the task list and reports on the first one that answers. Servers that
  # cannot be reached or return unusable data are collected and reported only
  # when no server produced a result.
  #
  # NOTE(review): ok/critical/unknown come from sensu-plugin and are presumed
  # to terminate the process; the control flow below relies on that.
  def run
    if config[:instances].zero?
      unknown 'number of instances should be an integer'
    end

    # Exactly one of the two credentials being present is a usage error.
    if config[:username].nil? != config[:password].nil?
      unknown 'You must provide both username and password'
    end

    failures = []
    config[:server].split(',').each do |s|
      begin
        ok_count, unhealthy = check_tasks(fetch_tasks_body(s))

        message = "#{ok_count}/#{config[:instances]} #{config[:task]} tasks running"
        message << ":\n" << unhealthy.join("\n") if unhealthy.any?

        critical message if unhealthy.any? || ok_count < config[:instances]
        ok message
      rescue Errno::ECONNREFUSED, SocketError
        failures << "Marathon on #{s} could not be reached"
      rescue => err
        failures << "error caught trying to reach Marathon on #{s}: #{err}"
      end
    end

    # Interpolation instead of << so the literal is never mutated.
    unknown "marathon task state could not be retrieved:\n#{failures.join("\n")}"
  end

  # Parses JSON data as returned from Marathon's tasks API
  # @param data [String] Server response
  # @return [Numeric, [String]] Number of running tasks and a list of error
  #   messages from unhealthy tasks
  # @raise [RuntimeError] if the response is not valid JSON or carries no
  #   'tasks' entry
  def check_tasks(data)
    begin
      parsed = JSON.parse(data)
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    # A non-object document (e.g. a JSON array) has no 'tasks' entry either.
    tasks = parsed.is_a?(Hash) ? parsed['tasks'] : nil
    raise "No tasks in server response: #{data}" if tasks.nil?

    app_id = "/#{config[:task]}"
    app_tasks = tasks.select { |t| t['appId'] == app_id }

    # Collect last error message for all health checks that are not alive
    unhealthy = app_tasks.flat_map do |task|
      checks = task['healthCheckResults'] || []
      checks.reject { |check| check['alive'] }.map do |check|
        check['lastFailureCause'] || 'Health check not alive'
      end
    end

    [app_tasks.length, unhealthy]
  end

  private

  # Issues the GET request for the configured endpoint against one server.
  # @param server [String] Host name or address of a Marathon instance
  # @return [String] Raw response body
  def fetch_tasks_body(server)
    url = URI.parse("#{config[:protocol]}://#{server}:#{config[:port]}#{config[:uri]}")
    req = Net::HTTP::Get.new(url)
    req.add_field('Accept', 'application/json')
    if config[:username] && config[:password]
      req.basic_auth(config[:username], config[:password])
    end

    response = Net::HTTP.start(url.host, url.port,
                               use_ssl: config[:protocol] == 'https') do |h|
      h.request(req)
    end
    response.body
  end
end