bin/check-marathon-task.rb in sensu-plugins-mesos-0.1.1 vs bin/check-marathon-task.rb in sensu-plugins-mesos-1.0.0
- old
+ new
@@ -31,50 +31,146 @@
require 'sensu-plugin/check/cli'
require 'net/http'
require 'json'
+# This plugin checks that the given Mesos/Marathon task is running properly.
+#
+ # This means that all of the following are true:
+ # 1. There are at least N tasks for the app, where N is the --instances parameter
+# 2. Each task's state is running
+# 3. No task is unhealthy, as defined in Marathon
+#
+ # A task is seen as **unhealthy** by Marathon if any of the health checks for
+ # the task is not **alive**. Alive means that a check's last success is more
+ # recent than its last failure. A check is not alive if its last failure is
+ # more recent than its last success, or if it has never succeeded at all.
class MarathonTaskCheck < Sensu::Plugin::Check::CLI
check_name 'CheckMarathonTask'
- option :server, short: '-s SERVER', long: '--server SERVER', required: true
- option :port, short: '-p PORT', long: '--port PORT', default: 8080
- option :task, short: '-t TASK', long: '--task TASK', required: true
- option :instances, short: '-i INSTANCES', long: '--instances INSTANCES', required: true, proc: proc(&:to_i)
+ # Marathon host, or a comma-separated list of hosts that run tries in order.
+ option :server,
+ short: '-s SERVER',
+ long: '--server SERVER',
+ required: true
+
+ # Port used for every server in the list.
+ option :port,
+ short: '-p PORT',
+ long: '--port PORT',
+ default: 8080
+
+ # Path and query string appended to protocol://server:port.
+ # Deliberately has no short switch: `-u` is claimed by --username below, and
+ # one short switch cannot select two different options.
+ option :uri,
+ description: 'Endpoint URI',
+ long: '--uri URI',
+ default: '/v2/tasks?status=running'
+
+ # App id without the leading slash; compared against each task's appId.
+ option :task,
+ short: '-t TASK',
+ long: '--task TASK',
+ required: true
+
+ # Minimum number of tasks expected for the app. The value goes through to_i,
+ # so non-numeric input becomes 0, which run rejects.
+ option :instances,
+ short: '-i INSTANCES',
+ long: '--instances INSTANCES',
+ required: true,
+ proc: proc(&:to_i)
+
+ # URL scheme of the request; run enables TLS only for 'https'.
+ option :protocol,
+ short: '-P PROTOCOL',
+ long: '--protocol PROTOCOL',
+ required: false,
+ default: 'http'
+
+ # Optional HTTP basic auth credentials; run requires them to be given together.
+ option :username,
+ short: '-u USERNAME',
+ long: '--username USERNAME',
+ required: false
+
+ option :password,
+ long: '--password PASSWORD',
+ required: false
+
+ # Entry point of the check: validates the options, then asks each
+ # comma-separated --server in turn for its task list and reports on the
+ # configured app.
+ #
+ # Reports critical when any health check of the app is not alive or fewer
+ # than --instances tasks are listed, ok otherwise, and unknown when the
+ # options are invalid or no server produced a usable response.
+ # NOTE(review): assumes the sensu-plugin helpers `ok`, `critical` and
+ # `unknown` terminate the check, so the first server that answers decides
+ # the result -- confirm against sensu-plugin.
def run
- if config[:instances] == 0
+ # --instances is converted with to_i, so a non-numeric value arrives as 0.
+ if config[:instances].zero?
unknown 'number of instances should be an integer'
end
+ # Basic auth needs both credentials; reject the case where only one is set.
+ if !config[:username].nil? && config[:password].nil? ||
+ config[:username].nil? && !config[:password].nil?
+ unknown 'You must provide both username and password'
+ end
+
failures = []
+ uri = config[:uri]
config[:server].split(',').each do |s|
begin
- url = URI.parse("http://#{s}:#{config[:port]}/v2/tasks?state=running")
+ url = URI.parse("#{config[:protocol]}://#{s}:#{config[:port]}#{uri}")
req = Net::HTTP::Get.new(url)
req.add_field('Accept', 'application/json')
- r = Net::HTTP.new(url.host, url.port).start do |h|
+ # Send HTTP basic auth only when credentials were supplied.
+ if !config[:username].nil? && !config[:password].nil?
+ req.basic_auth(config[:username], config[:password])
+ end
+ # TLS is switched on only when the protocol option is exactly 'https'.
+ r = Net::HTTP.start(url.host, url.port,
+ use_ssl: config[:protocol] == 'https') do |h|
h.request(req)
end
- tasks = JSON.parse(r.body)['tasks']
- tasks.select! do |t|
- t['appId'] == "/#{config[:task]}"
- end
+ ok_count, unhealthy = check_tasks r.body
- message = "#{tasks.length}/#{config[:instances]} #{config[:task]} tasks running"
+ message = "#{ok_count}/#{config[:instances]} #{config[:task]} tasks running"
- if tasks.length < config[:instances]
+ # Append one line per failing health check to the status message.
+ if unhealthy.any?
+ message << ":\n" << unhealthy.join("\n")
+ end
+
+ if unhealthy.any? || ok_count < config[:instances]
critical message
end
ok message
+ # Could not connect to this server: record it and move on to the next one.
rescue Errno::ECONNREFUSED, SocketError
failures << "Marathon on #{s} could not be reached"
- rescue
- failures << "error caught trying to reach Marathon on #{s}"
+ # Any other StandardError, including the errors check_tasks raises, is
+ # recorded together with its message.
+ rescue => err
+ failures << "error caught trying to reach Marathon on #{s}: #{err}"
end
end
+ # Falls through to here once every server has been tried without a verdict.
unknown "marathon task state could not be retrieved:\n" << failures.join("\n")
+ end
+
+   # Extracts this check's view of the app from a Marathon tasks response.
+   #
+   # Only tasks whose appId equals "/<--task>" are considered.
+   #
+   # @param data [String] raw response body from the tasks endpoint
+   # @return [Array(Integer, Array<String>)] how many tasks belong to the app,
+   #   followed by one message per health check result that is not alive
+   # @raise [RuntimeError] if the body is not valid JSON or has no 'tasks' entry
+   def check_tasks(data)
+     begin
+       all_tasks = JSON.parse(data)['tasks']
+     rescue JSON::ParserError
+       raise "Could not parse JSON response: #{data}"
+     end
+     raise "No tasks in server response: #{data}" if all_tasks.nil?
+
+     app_id = "/#{config[:task]}"
+     app_tasks = all_tasks.select { |task| task['appId'] == app_id }
+
+     # A result without a recorded failure cause still counts as unhealthy,
+     # so fall back to a generic message.
+     not_alive = app_tasks.flat_map do |task|
+       results = task['healthCheckResults'] || []
+       results.reject { |result| result['alive'] }.map do |result|
+         result['lastFailureCause'] || 'Health check not alive'
+       end
+     end
+
+     [app_tasks.length, not_alive]
    end
end