lib/flapjack/executive.rb in flapjack-0.7.1 vs lib/flapjack/executive.rb in flapjack-0.7.2
- old
+ new
@@ -78,36 +78,50 @@
# FIXME: all of the below keys assume there is only ever one executive running;
# we could generate a fuid and save it to disk, and prepend it from that
# point on...
- # TODO unset on exit?
- @redis.set('boot_time', @boot_time.to_i)
-
# FIXME: add an administrative function to reset all event counters
if @redis.hget('event_counters', 'all').nil?
@redis.hset('event_counters', 'all', 0)
@redis.hset('event_counters', 'ok', 0)
@redis.hset('event_counters', 'failure', 0)
@redis.hset('event_counters', 'action', 0)
end
- @redis.zadd('executive_instances', @boot_time.to_i, @instance_id)
+ #@redis.zadd('executive_instances', @boot_time.to_i, @instance_id)
+ @redis.hset("executive_instance:#{@instance_id}", 'boot_time', @boot_time.to_i)
@redis.hset("event_counters:#{@instance_id}", 'all', 0)
@redis.hset("event_counters:#{@instance_id}", 'ok', 0)
@redis.hset("event_counters:#{@instance_id}", 'failure', 0)
@redis.hset("event_counters:#{@instance_id}", 'action', 0)
+ touch_keys
end
+ # expire instance keys after one week
+ # TODO: set up a separate EM timer to reset key expiry every minute
+ # and reduce the expiry to, say, five minutes
+ # TODO: remove these keys on process exit
+ def touch_keys
+ [ "executive_instance:#{@instance_id}",
+ "event_counters:#{@instance_id}",
+ "event_counters:#{@instance_id}",
+ "event_counters:#{@instance_id}",
+ "event_counters:#{@instance_id}" ].each {|key|
+ @redis.expire(key, 1036800)
+ }
+ end
+
# Main loop: repeatedly asks Flapjack::Data::Event.next for an event and hands
# it to process_event, until @should_quit is set. A nil event is skipped.
# Logs at info level when the loop starts and when it exits.
# (In this diff, the new version additionally passes :logger to Event.next.)
def start
@logger.info("Booting main loop.")
until @should_quit
@logger.debug("Waiting for event...")
event = Flapjack::Data::Event.next(:redis => @redis,
:archive_events => @archive_events,
- :events_archive_maxage => @events_archive_maxage)
+ :events_archive_maxage => @events_archive_maxage,
+ :logger => @logger)
process_event(event) unless event.nil?
end
@logger.info("Exiting main loop.")
end
@@ -155,10 +169,14 @@
@logger.info("Generating notifications for event #{event.id}, #{event.type}, #{event.state}, #{event.summary}#{time_at_str}")
generate_notification_messages(event, entity_check)
end
def update_keys(event, entity_check)
+
+ # TODO: run touch_keys from a separate EM timer for efficiency
+ touch_keys
+
result = { :skip_filters => false }
timestamp = Time.now.to_i
@event_count = @redis.hincrby('event_counters', 'all', 1)
@redis.hincrby("event_counters:#{@instance_id}", 'all', 1)
@@ -241,26 +259,33 @@
notification_type = 'acknowledgement'
when 'test_notifications'
notification_type = 'test'
end
end
+
+ max_notified_severity = entity_check.max_notified_severity_of_current_failure
+
@redis.set("#{event.id}:last_#{notification_type}_notification", timestamp)
@redis.set("#{event.id}:last_#{event.state}_notification", timestamp) if event.failure?
@redis.rpush("#{event.id}:#{notification_type}_notifications", timestamp)
@redis.rpush("#{event.id}:#{event.state}_notifications", timestamp) if event.failure?
@logger.debug("Notification of type #{notification_type} is being generated for #{event.id}.")
contacts = entity_check.contacts
if contacts.empty?
+ @logger.debug("No contacts for #{event.id}")
@notifylog.info("#{Time.now.to_s} | #{event.id} | #{notification_type} | NO CONTACTS")
return
end
- notification = Flapjack::Data::Notification.for_event(event, :type => notification_type)
+ notification = Flapjack::Data::Notification.for_event(
+ event, :type => notification_type, :max_notified_severity => max_notified_severity)
- enqueue_messages( apply_notification_rules( notification.messages(:contacts => contacts) ) )
+ messages = notification.messages(:contacts => contacts)
+ messages = apply_notification_rules(messages)
+ enqueue_messages(messages)
end
# time restrictions match?
# nil rule.time_restrictions matches
@@ -289,12 +314,11 @@
@logger.debug "apply_notification_rules: got messages with size #{messages.size}"
# don't consider notification rules if the contact has none
tuple = messages.map do |message|
- @logger.debug "considering message: #{message.medium} #{message.notification.event.id} #{message.notification.event.state}"
- @logger.debug "contact_id: #{message.contact.id}"
+ @logger.debug "considering message for contact: #{message.contact.id} #{message.medium} #{message.notification.event.id} #{message.notification.event.state}"
rules = message.contact.notification_rules
@logger.debug "found #{rules.length} rules for this message's contact"
event_id = message.notification.event.id
options = {}
options[:no_rules_for_contact] = true if rules.empty?
@@ -335,19 +359,36 @@
@logger.debug "apply_notification_rules: num messages after removing blackhole matches: #{tuple.size}"
# delete any media that doesn't meet severity<->media constraints
tuple = tuple.find_all do |message, matchers, options|
- severity = message.notification.event.state
+ state = message.notification.event.state
+ max_notified_severity = message.notification.max_notified_severity
+
+ # use EntityCheck#max_notified_severity_of_current_failure
+ # as calculated prior to updating the last_notification* keys
+ # if it's a higher severity than the current state
+ severity = 'ok'
+ case
+ when ([state, max_notified_severity] & ['critical', 'unknown']).any?
+ severity = 'critical'
+ when [state, max_notified_severity].include?('warning')
+ severity = 'warning'
+ end
options[:no_rules_for_contact] ||
matchers.any? {|matcher|
- matcher.media_for_severity(severity).include?(message.medium) ||
- (@logger.warn("got nil for matcher.media_for_severity(#{severity}), matcher: #{matcher.inspect}") && false)
+ mms = matcher.media_for_severity(severity)
+ unless mms
+ answer = false
+ else
+ answer = mms.include?(message.medium)
+ end
+ answer
}
end
- @logger.debug "apply_notification_rules: num messages after severity-media constraints: #{tuple.size}"
+ @logger.debug "apply_notification_rules: num messages after pruning for severity-media constraints: #{tuple.size}"
# delete media based on notification interval
tuple = tuple.find_all do |message, matchers, options|
not message.contact.drop_notifications?(:media => message.medium,
:check => message.notification.event.id,
@@ -376,13 +417,26 @@
return
end
@logger.info("Enqueueing #{media_type} alert for #{event_id} to #{message.address}")
- message.contact.update_sent_alert_keys(:media => message.medium,
- :check => message.notification.event.id,
- :state => message.notification.event.state)
- # drop_alerts_for_contact:#{self.id}:#{media}:#{check}:#{state}
+ if message.notification.event.state == 'ok'
+ message.contact.update_sent_alert_keys(
+ :media => message.medium,
+ :check => message.notification.event.id,
+ :state => 'warning',
+ :delete => true)
+ message.contact.update_sent_alert_keys(
+ :media => message.medium,
+ :check => message.notification.event.id,
+ :state => 'critical',
+ :delete => true)
+ else
+ message.contact.update_sent_alert_keys(
+ :media => message.medium,
+ :check => message.notification.event.id,
+ :state => message.notification.event.state)
+ end
# TODO consider changing Resque jobs to use raw blpop like the others
case media_type.to_sym
when :sms
Resque.enqueue_to(@queues[:sms], Flapjack::Gateways::SmsMessagenet, contents)