bin/check-emr-steps.rb in sensu-plugins-aws-2.3.0 vs bin/check-emr-steps.rb in sensu-plugins-aws-2.4.0

- old
+ new

@@ -6,17 +6,16 @@
 # Alerts on any failed steps for a cluster in the past 10 minutes.
 #
 # OUTPUT:
 # plain-text
 #
-#
 # DEPENDENCIES:
 # gem: aws-sdk
 # gem: sensu-plugin
 #
 # USAGE:
-# check-emr-steps.rb -r us-west-2 -b 'My Cluster'
+# check-emr-steps.rb -r us-west-2 -b 'My Cluster' -t FAILED -c 0
 #
 # This will alert on any failed steps in the past 10 minutes on the latest cluster
 # with the name 'My Cluster'.
 # NOTES:
 #
@@ -27,11 +26,11 @@
 require 'sensu-plugins-aws'
 require 'sensu-plugin/metric/cli'
 require 'aws-sdk'
-class CheckEMRSteps < Sensu::Plugin::Metric::CLI::Graphite
+class CheckEMRSteps < Sensu::Plugin::Check::CLI
   include Common
   option :scheme,
          description: 'Metric naming scheme, text to prepend to metric',
          short: '-s SCHEME',
@@ -48,31 +47,43 @@
          short: '-b CLUSTER_NAME',
          long: '--cluster-name',
          description: 'The name of the EMR cluster',
          required: true
+  option :status,
+         short: '-t STEP_STATUS',
+         long: '--step-status',
+         description: 'Step status to check, [PENDING RUNNING COMPLETED CANCELLED FAILED INTERRUPTED]',
+         default: 'FAILED'
+
+  option :count,
+         short: '-c COUNT',
+         long: '--count',
+         description: 'Max number of steps with this status.',
+         default: 0
+
+
   def run
     emr = Aws::EMR::Client.new(aws_config)
     begin
       emr_clusters = emr.list_clusters.clusters
       clusters = emr_clusters.select { |c| c.name == config[:cluster_name] }
       critical "EMR cluster #{config[:cluster_name]} not found" if clusters.empty?
       cluster = clusters.sort_by { |c| c.status.timeline.creation_date_time }.reverse.first
       steps = emr.list_steps(
         cluster_id: cluster.id,
-        step_states: ['FAILED']
+        step_states: config[:status]
       ).steps
       messages = []
       now = Time.new
       failed = steps.select { |step| now - step.status.timeline.end_date_time < 10 * 60 }
-      failed.each_entry { |step| messages << "Step #{step.id} '#{step.name}' has failed on cluster #{cluster.id} '#{cluster.name}'" }
-
-      if messages.count > 0
-        critical("#{messages.count} #{messages.count > 1 ? 'steps have' : 'step has'} failed: #{messages.join(',')}")
-      else
-        ok
+      if failed.size > config[:count]
+        failed.each_entry { |step| messages << "Step #{step.id} '#{step.name}' has failed on cluster #{cluster.id} '#{cluster.name}'" }
+        if messages.count > 0
+          critical("#{messages.count} #{messages.count > 1 ? 'steps have' : 'step has'} failed: #{messages.join(',')}")
+        end
       end
+      ok
     end
   end
 end