bin/check-emr-steps.rb in sensu-plugins-aws-2.3.0 vs bin/check-emr-steps.rb in sensu-plugins-aws-2.4.0
- old
+ new
@@ -6,17 +6,16 @@
# Alerts when more than COUNT steps with the given status (by default, any FAILED
# step) have ended on a cluster in the past 10 minutes.
#
# OUTPUT:
# plain-text
#
-#
# DEPENDENCIES:
# gem: aws-sdk
# gem: sensu-plugin
#
# USAGE:
-# check-emr-steps.rb -r us-west-2 -b 'My Cluster'
+# check-emr-steps.rb -r us-west-2 -b 'My Cluster' -t FAILED -c 0
#
# This will alert on any failed steps in the past 10 minutes on the latest cluster
# with the name 'My Cluster'.
#
# NOTES:
#
@@ -27,11 +26,11 @@
require 'sensu-plugins-aws'
require 'sensu-plugin/check/cli'
require 'sensu-plugin/metric/cli'
require 'aws-sdk'
-class CheckEMRSteps < Sensu::Plugin::Metric::CLI::Graphite
+class CheckEMRSteps < Sensu::Plugin::Check::CLI
include Common
option :scheme,
description: 'Metric naming scheme, text to prepend to metric',
short: '-s SCHEME',
@@ -48,31 +47,43 @@
short: '-b CLUSTER_NAME',
long: '--cluster-name',
description: 'The name of the EMR cluster',
required: true
# Step state to look for; run passes this value to the EMR list_steps
# step_states filter.
option :status,
       short: '-t STEP_STATUS',
       long: '--step-status',
       description: 'Step status to check, [PENDING RUNNING COMPLETED CANCELLED FAILED INTERRUPTED]',
       default: 'FAILED'

# Alert threshold: the check goes critical when MORE than this many matching
# steps are found. Command-line values arrive as Strings, so they are coerced
# to Integer here; otherwise the numeric comparison in run raises.
option :count,
       short: '-c COUNT',
       long: '--count',
       description: 'Max number of steps with this status.',
       proc: proc(&:to_i),
       default: 0
+
# Looks up the most recently created EMR cluster named config[:cluster_name],
# lists its steps in state config[:status], and reports critical when more than
# config[:count] of them ended within the last 10 minutes; otherwise reports ok.
#
# `critical` and `ok` come from the check base class and are expected to end
# the run (the code after the "not found" guard relies on that).
#
# NOTE(review): the message wording still says "failed" whatever state was
# requested; it is kept unchanged so existing output stays the same.
def run
  emr = Aws::EMR::Client.new(aws_config)

  candidates = emr.list_clusters.clusters.select { |c| c.name == config[:cluster_name] }
  critical "EMR cluster #{config[:cluster_name]} not found" if candidates.empty?

  # Several clusters may share a name; only the newest one is inspected.
  cluster = candidates.max_by { |c| c.status.timeline.creation_date_time }

  # step_states must be a list of state names, not a bare String.
  steps = emr.list_steps(
    cluster_id: cluster.id,
    step_states: Array(config[:status])
  ).steps

  # The threshold may arrive as a String from the command line.
  threshold = config[:count].to_i
  window = 10 * 60
  now = Time.now

  recent = steps.select do |step|
    ended_at = step.status.timeline.end_date_time
    # Steps that have not ended yet (e.g. PENDING/RUNNING) have no end time;
    # they are currently in the requested state, so count them instead of
    # failing on `Time - nil`.
    ended_at.nil? || now - ended_at < window
  end

  # The emptiness guard keeps a negative threshold from alerting on zero steps.
  if recent.size > threshold && !recent.empty?
    messages = recent.map do |step|
      "Step #{step.id} '#{step.name}' has failed on cluster #{cluster.id} '#{cluster.name}'"
    end
    critical("#{messages.count} #{messages.count > 1 ? 'steps have' : 'step has'} failed: #{messages.join(',')}")
  end

  ok
end
end