alarm.rb in ring-sqa-0.0.18

- old
+ new

@@ -1,9 +1,10 @@
 require_relative 'alarm/email'
 require_relative 'alarm/udp2irc'
 require_relative 'alarm/cfg'
 require_relative 'mtr'
+require_relative 'paste'
 require_relative 'nodes_json'
 
 module Ring
 class SQA
 
@@ -42,11 +43,11 @@
       exceeding_nodes = alarm_buffer.exceeding_nodes
       msg = {short: "#{@hostname}: raising alarm - #{exceeding_nodes.size} new nodes down"}
       nodes = NodesJSON.new
 
       nodes_list = ''
-      exceeding_nodes.each do |node|
+      exceeding_nodes.sort!.each do |node|
         json = nodes.get node
         nodes_list << "- %-30s %14s AS%5s %2s\n" % [json['hostname'], node, json['asn'], json['countrycode']]
       end
 
       mtr_list = ''
@@ -58,27 +59,40 @@
       end
 
       buffer_list = ''
       time = alarm_buffer.array.size-1
       alarm_buffer.array.each do |ary|
-        buffer_list << "%2s min ago %3s measurements failed\n" % [time, ary.size/2]
+        buffer_list << "%2s min ago %3s measurements failed" % [time, ary.size/2]
+        type = time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n"
+        buffer_list << type
         time -= 1
       end
 
       msg[:long] = <<EOF
-This is an automated alert from the distributed partial outage monitoring system "RING SQA".
+This is an automated alert from the distributed partial outage 
+monitoring system "RING SQA".
 
-At #{Time.now.utc} the following measurements were analysed as indicating that there is a high probability your NLNOG RING node cannot reach the entire internet. Possible causes could be an outage in your upstream's or peer's network.
+At #{Time.now.utc} the following measurements were analysed 
+as indicating that there is a high probability your NLNOG RING node 
+cannot reach the entire internet. Possible causes could be an outage 
+in your upstream's or peer's network.
 
-The following nodes previously were reachable, but became unreachable over the course of the last 3 minutes:
+The following nodes previously were reachable, but became unreachable 
+over the course of the last 3 minutes:
 
 #{nodes_list}
 
-As a debug starting point 3 traceroutes were launched right after detecting the event, they might assist in pinpointing what broke:
+As a debug starting point 3 traceroutes were launched right after 
+detecting the event, they might assist in pinpointing what broke:
 
 #{mtr_list}
 
-An alarm is raised under the following conditions: every 30 seconds your node pings all other nodes. The amount of nodes that cannot be reached is stored in a circular buffer, with each element representing a minute of measurements. In the event that the last three minutes are #{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial outage is assumed. The ring buffer's output is as following:
+An alarm is raised under the following conditions: every 30 seconds 
+your node pings all other nodes. The amount of nodes that cannot be 
+reached is stored in a circular buffer, with each element representing 
+a minute of measurements. In the event that the last three minutes are 
+#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial 
+outage is assumed. The ring buffer's output is as following:
 
 #{buffer_list}
 
 Kind regards,