bin/check-rds-events.rb in sensu-plugins-aws-2.1.0 vs bin/check-rds-events.rb in sensu-plugins-aws-2.1.1

- old
+ new

@@ -3,14 +3,17 @@ # check-rds-events # # # DESCRIPTION: # This plugin checks rds clusters for critical events. -# Due to the number of events types on RDS clusters the check searches for -# events containing the text string 'has started' or 'is being'. These events all have -# accompanying completiion events and are impacting events +# Due to the number of events types on RDS clusters, the check +# should filter out non-disruptive events that are part of +# basic operations. # +# More info on RDS events: +# http://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_Events.html +# # OUTPUT: # plain-text # # PLATFORMS: # Linux @@ -74,17 +77,25 @@ begin # fetch all clusters identifiers clusters = rds.describe_db_instances[:db_instances].map { |db| db[:db_instance_identifier] } maint_clusters = [] - # fetch the last 2 hours of events for each cluster + # fetch the last 15 minutes of events for each cluster + # that way, we're only spammed with persistent notifications that we'd care about. clusters.each do |cluster_name| - events_record = rds.describe_events(start_time: (Time.now - 7200).iso8601, source_type: 'db-instance', source_identifier: cluster_name) + events_record = rds.describe_events(start_time: (Time.now - 900).iso8601, source_type: 'db-instance', source_identifier: cluster_name) next if events_record[:events].empty? - # if the last event is a start maint event then the cluster is still in maint + # we will need to filter out non-disruptive/basic operation events. + # ie. the regular backup operations + next if events_record[:events][-1][:message] =~ /Backing up DB instance|Finished DB Instance backup|Restored from snapshot/ + # ie. Replication resumed + next if events_record[:events][-1][:message] =~ /Replication for the Read Replica resumed/ + # you can add more filters to skip more events. + + # draft the messages cluster_name_long = "#{cluster_name} (#{aws_config[:region]}) #{events_record[:events][-1][:message]}" - maint_clusters.push(cluster_name_long) if events_record[:events][-1][:message] =~ /has started|is being|off-line|shutdown/ + maint_clusters.push(cluster_name_long) end rescue => e unknown "An error occurred processing AWS RDS API: #{e.message}" end maint_clusters