require 'socket'

require_relative 'alarm/email'
require_relative 'alarm/udp2irc'
require_relative 'alarm/cfg'
require_relative 'mtr'
require_relative 'nodes_json'

module Ring
  class SQA
    # Holds a single raised/cleared alarm flag and, on every transition,
    # logs a short summary and hands the message to each configured
    # notification method (Email and/or UDP2IRC).
    class Alarm
      # @param database [Object] stored in @db; not used by any method
      #   visible in this file.
      def initialize(database)
        @db = database
        @methods = []
        @methods << Email.new if CFG.email.to?
        @methods << UDP2IRC.new if CFG.irc.password?
        @alarm = false
        @hostname = local_hostname
      end

      # Raises the alarm unless it is already raised.
      #
      # @param alarm_buffer [#exceeding_nodes, #size, #array] measurement
      #   buffer the alert text is built from.
      # @return [Array, nil] the notification methods, or nil when the alarm
      #   was already raised.
      def set(alarm_buffer)
        return if @alarm

        # Compose first: building the message performs node lookups and runs
        # traceroutes, either of which may raise. Flipping the flag only
        # afterwards keeps a failed composition from latching the alarm
        # without anybody having been notified.
        msg = compose_message(alarm_buffer)
        @alarm = true
        notify msg
      end

      # Clears the alarm if it is currently raised.
      #
      # @return [Array, nil] the notification methods, or nil when no alarm
      #   was raised.
      def clear
        return unless @alarm

        @alarm = false
        summary = "#{@hostname}: clearing alarm"
        notify(short: summary, long: summary)
      end

      private

      # Best-effort local hostname; any lookup failure yields 'anonymous'.
      def local_hostname
        Socket.gethostname
      rescue StandardError
        'anonymous'
      end

      # Logs the short form and passes the full message hash to every
      # notification method. `send` here is the notifier's own delivery method.
      def notify(msg)
        Log.info msg[:short]
        @methods.each { |alarm_method| alarm_method.send msg }
      end

      # Builds the alert message.
      #
      # @param alarm_buffer [#exceeding_nodes, #size, #array]
      # @return [Hash] :short is a one-line summary, :long the full report.
      def compose_message(alarm_buffer)
        exceeding_nodes = alarm_buffer.exceeding_nodes
        msg = { short: "#{@hostname}: raising alarm - #{exceeding_nodes.size} new nodes down" }
        nodes = NodesJSON.new

        nodes_list = exceeding_nodes.map do |node|
          json = nodes.get node
          format("- %-30s %14s AS%5s %2s\n", json['hostname'], node, json['asn'], json['countrycode'])
        end.join

        # Traceroutes are only run against a random sample of at most three
        # of the newly unreachable nodes.
        mtr_list = exceeding_nodes.sample(3).map do |node|
          json = nodes.get node
          header = format("%-30s AS%5s (%2s)\n", json['hostname'], json['asn'], json['countrycode'])
          header + MTR.run(node) + "\n"
        end.join

        # The first slot is labelled size-1 minutes ago, counting down to 0.
        # NOTE(review): slot size is halved before reporting; presumably each
        # failed measurement contributes two entries - confirm against the
        # buffer implementation.
        oldest = alarm_buffer.size - 1
        buffer_list = alarm_buffer.array.each_with_index.map do |slot, index|
          format("%2s min ago %3s measurements failed\n", oldest - index, slot.size / 2)
        end.join

        msg[:long] = <<EOF
This is an automated alert from the distributed partial outage
monitoring system "RING SQA".

At #{Time.now.utc} the following measurements were analysed
as indicating that there is a high probability your NLNOG RING node
cannot reach the entire internet. Possible causes could be an outage
in your upstream's or peer's network.

The following nodes previously were reachable, but became unreachable
over the course of the last 3 minutes:

#{nodes_list}

As a debug starting point 3 traceroutes were launched right after
detecting the event, they might assist in pinpointing what broke:

#{mtr_list}

An alarm is raised under the following conditions: every 30 seconds
your node pings all other nodes. The amount of nodes that cannot be
reached is stored in a circular buffer, with each element representing
a minute of measurements. In the event that the last three minutes are
#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a
partial outage is assumed.

The ring buffer's output is as following:

#{buffer_list}

Kind regards,

NLNOG RING
EOF
        msg
      end
    end
  end
end