lib/ring/sqa/alarm.rb in ring-sqa-0.0.18 vs lib/ring/sqa/alarm.rb in ring-sqa-0.0.19

- old
+ new

@@ -28,70 +28,69 @@ end end private - def initialize database - @db = database + def initialize @methods = [] @methods << Email.new if CFG.email.to? @methods << UDP2IRC.new if CFG.irc.password? @alarm = false @hostname = (Socket.gethostname rescue 'anonymous') end def compose_message alarm_buffer exceeding_nodes = alarm_buffer.exceeding_nodes msg = {short: "#{@hostname}: raising alarm - #{exceeding_nodes.size} new nodes down"} - nodes = NodesJSON.new + nodes_json = NodesJSON.new + exceeding_nodes = exceeding_nodes.map { |node| nodes_json.get node } nodes_list = '' - exceeding_nodes.sort!.each do |node| - json = nodes.get node - nodes_list << "- %-30s %14s AS%5s %2s\n" % [json['hostname'], node, json['asn'], json['countrycode']] + exceeding_nodes.sort_by{ |node| node[:cc] }.each do |node| + nodes_list << "- %-35s %15s AS%-6s %2s\n" % [node[:name], node[:ip], node[:as], node[:cc]] end mtr_list = '' exceeding_nodes.sample(3).each do |node| - json = nodes.get node - mtr_list << "%-30s AS%5s (%2s)\n" % [json['hostname'], json['asn'], json['countrycode']] - mtr_list << MTR.run(node) + mtr_list << "%-35s AS%-6s (%2s)\n" % [node[:name], node[:as], node[:cc]] + mtr_list << MTR.run(node[:ip]) mtr_list << "\n" end buffer_list = '' time = alarm_buffer.array.size-1 alarm_buffer.array.each do |ary| buffer_list << "%2s min ago %3s measurements failed" % [time, ary.size/2] - type = time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n" - buffer_list << type + buffer_list << (time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n") time -= 1 end msg[:long] = <<EOF -This is an automated alert from the distributed partial outage +Regarding: #{hostname} + +This is an automated alert from the distributed partial outage monitoring system "RING SQA". -At #{Time.now.utc} the following measurements were analysed -as indicating that there is a high probability your NLNOG RING node -cannot reach the entire internet. Possible causes could be an outage +At #{Time.now.utc} the following measurements were analysed +as indicating that there is a high probability your NLNOG RING node +cannot reach the entire internet. Possible causes could be an outage in your upstream's or peer's network. -The following nodes previously were reachable, but became unreachable +The following #{exceeding_nodes.size} nodes previously were reachable, but became unreachable over the course of the last 3 minutes: #{nodes_list} -As a debug starting point 3 traceroutes were launched right after +As a debug starting point 3 traceroutes were launched right after detecting the event, they might assist in pinpointing what broke: #{mtr_list} -An alarm is raised under the following conditions: every 30 seconds -your node pings all other nodes. The amount of nodes that cannot be -reached is stored in a circular buffer, with each element representing -a minute of measurements. In the event that the last three minutes are -#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial +An alarm is raised under the following conditions: every 30 seconds +your node pings all other nodes. The amount of nodes that cannot be +reached is stored in a circular buffer, with each element representing +a minute of measurements. In the event that the last three minutes are +#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial outage is assumed. The ring buffer's output is as following: #{buffer_list} Kind regards,