lib/flapjack/executive.rb in flapjack-0.7.1 vs lib/flapjack/executive.rb in flapjack-0.7.2

- old
+ new

@@ -78,36 +78,50 @@ # FIXME: all of the below keys assume there is only ever one executive running; # we could generate a fuid and save it to disk, and prepend it from that # point on... - # TODO unset on exit? - @redis.set('boot_time', @boot_time.to_i) - # FIXME: add an administrative function to reset all event counters if @redis.hget('event_counters', 'all').nil? @redis.hset('event_counters', 'all', 0) @redis.hset('event_counters', 'ok', 0) @redis.hset('event_counters', 'failure', 0) @redis.hset('event_counters', 'action', 0) end - @redis.zadd('executive_instances', @boot_time.to_i, @instance_id) + #@redis.zadd('executive_instances', @boot_time.to_i, @instance_id) + @redis.hset("executive_instance:#{@instance_id}", 'boot_time', @boot_time.to_i) @redis.hset("event_counters:#{@instance_id}", 'all', 0) @redis.hset("event_counters:#{@instance_id}", 'ok', 0) @redis.hset("event_counters:#{@instance_id}", 'failure', 0) @redis.hset("event_counters:#{@instance_id}", 'action', 0) + touch_keys end + # expire instance keys after one week + # TODO: set up a separate EM timer to reset key expiry every minute + # and reduce the expiry to, say, five minutes + # TODO: remove these keys on process exit + def touch_keys + [ "executive_instance:#{@instance_id}", + "event_counters:#{@instance_id}", + "event_counters:#{@instance_id}", + "event_counters:#{@instance_id}", + "event_counters:#{@instance_id}" ].each {|key| + @redis.expire(key, 1036800) + } + end + def start @logger.info("Booting main loop.") until @should_quit @logger.debug("Waiting for event...") event = Flapjack::Data::Event.next(:redis => @redis, :archive_events => @archive_events, - :events_archive_maxage => @events_archive_maxage) + :events_archive_maxage => @events_archive_maxage, + :logger => @logger) process_event(event) unless event.nil? end @logger.info("Exiting main loop.") end @@ -155,10 +169,14 @@ @logger.info("Generating notifications for event #{event.id}, #{event.type}, #{event.state}, #{event.summary}#{time_at_str}") generate_notification_messages(event, entity_check) end def update_keys(event, entity_check) + + # TODO: run touch_keys from a separate EM timer for efficiency + touch_keys + result = { :skip_filters => false } timestamp = Time.now.to_i @event_count = @redis.hincrby('event_counters', 'all', 1) @redis.hincrby("event_counters:#{@instance_id}", 'all', 1) @@ -241,26 +259,33 @@ notification_type = 'acknowledgement' when 'test_notifications' notification_type = 'test' end end + + max_notified_severity = entity_check.max_notified_severity_of_current_failure + @redis.set("#{event.id}:last_#{notification_type}_notification", timestamp) @redis.set("#{event.id}:last_#{event.state}_notification", timestamp) if event.failure? @redis.rpush("#{event.id}:#{notification_type}_notifications", timestamp) @redis.rpush("#{event.id}:#{event.state}_notifications", timestamp) if event.failure? @logger.debug("Notification of type #{notification_type} is being generated for #{event.id}.") contacts = entity_check.contacts if contacts.empty? + @logger.debug("No contacts for #{event.id}") @notifylog.info("#{Time.now.to_s} | #{event.id} | #{notification_type} | NO CONTACTS") return end - notification = Flapjack::Data::Notification.for_event(event, :type => notification_type) + notification = Flapjack::Data::Notification.for_event( + event, :type => notification_type, :max_notified_severity => max_notified_severity) - enqueue_messages( apply_notification_rules( notification.messages(:contacts => contacts) ) ) + messages = notification.messages(:contacts => contacts) + messages = apply_notification_rules(messages) + enqueue_messages(messages) end # time restrictions match? # nil rule.time_restrictions matches @@ -289,12 +314,11 @@ @logger.debug "apply_notification_rules: got messages with size #{messages.size}" # don't consider notification rules if the contact has none tuple = messages.map do |message| - @logger.debug "considering message: #{message.medium} #{message.notification.event.id} #{message.notification.event.state}" - @logger.debug "contact_id: #{message.contact.id}" + @logger.debug "considering message for contact: #{message.contact.id} #{message.medium} #{message.notification.event.id} #{message.notification.event.state}" rules = message.contact.notification_rules @logger.debug "found #{rules.length} rules for this message's contact" event_id = message.notification.event.id options = {} options[:no_rules_for_contact] = true if rules.empty? @@ -335,19 +359,36 @@ @logger.debug "apply_notification_rules: num messages after removing blackhole matches: #{tuple.size}" # delete any media that doesn't meet severity<->media constraints tuple = tuple.find_all do |message, matchers, options| - severity = message.notification.event.state + state = message.notification.event.state + max_notified_severity = message.notification.max_notified_severity + + # use EntityCheck#max_notified_severity_of_current_failure + # as calculated prior to updating the last_notification* keys + # if it's a higher severity than the current state + severity = 'ok' + case + when ([state, max_notified_severity] & ['critical', 'unknown']).any? + severity = 'critical' + when [state, max_notified_severity].include?('warning') + severity = 'warning' + end options[:no_rules_for_contact] || matchers.any? {|matcher| - matcher.media_for_severity(severity).include?(message.medium) || - (@logger.warn("got nil for matcher.media_for_severity(#{severity}), matcher: #{matcher.inspect}") && false) + mms = matcher.media_for_severity(severity) + unless mms + answer = false + else + answer = mms.include?(message.medium) + end + answer } end - @logger.debug "apply_notification_rules: num messages after severity-media constraints: #{tuple.size}" + @logger.debug "apply_notification_rules: num messages after pruning for severity-media constraints: #{tuple.size}" # delete media based on notification interval tuple = tuple.find_all do |message, matchers, options| not message.contact.drop_notifications?(:media => message.medium, :check => message.notification.event.id, @@ -376,13 +417,26 @@ return end @logger.info("Enqueueing #{media_type} alert for #{event_id} to #{message.address}") - message.contact.update_sent_alert_keys(:media => message.medium, - :check => message.notification.event.id, - :state => message.notification.event.state) - # drop_alerts_for_contact:#{self.id}:#{media}:#{check}:#{state} + if message.notification.event.state == 'ok' + message.contact.update_sent_alert_keys( + :media => message.medium, + :check => message.notification.event.id, + :state => 'warning', + :delete => true) + message.contact.update_sent_alert_keys( + :media => message.medium, + :check => message.notification.event.id, + :state => 'critical', + :delete => true) + else + message.contact.update_sent_alert_keys( + :media => message.medium, + :check => message.notification.event.id, + :state => message.notification.event.state) + end # TODO consider changing Resque jobs to use raw blpop like the others case media_type.to_sym when :sms Resque.enqueue_to(@queues[:sms], Flapjack::Gateways::SmsMessagenet, contents)