lib/ring/sqa/alarm.rb in ring-sqa-0.0.18 vs lib/ring/sqa/alarm.rb in ring-sqa-0.0.19
- old
+ new
@@ -28,70 +28,69 @@
end
end
private
# Constructor: builds the list of notification methods and records the local
# hostname.
#
# Diff note: the old (0.0.18) signature took a `database` argument and kept
# it in @db; the new (0.0.19) signature takes no arguments and no longer
# sets @db.
- def initialize database
- @db = database
+ def initialize
@methods = []
# A notifier is only added when its configuration predicate is truthy:
# Email when CFG.email.to? and UDP2IRC when CFG.irc.password?.
@methods << Email.new if CFG.email.to?
@methods << UDP2IRC.new if CFG.irc.password?
# NOTE(review): presumably tracks whether an alarm is currently raised;
# the code that reads @alarm is not visible here -- confirm.
@alarm = false
# The rescue modifier falls back to 'anonymous' if Socket.gethostname raises
# a StandardError.
@hostname = (Socket.gethostname rescue 'anonymous')
end
def compose_message alarm_buffer
exceeding_nodes = alarm_buffer.exceeding_nodes
msg = {short: "#{@hostname}: raising alarm - #{exceeding_nodes.size} new nodes down"}
- nodes = NodesJSON.new
+ nodes_json = NodesJSON.new
+ exceeding_nodes = exceeding_nodes.map { |node| nodes_json.get node }
nodes_list = ''
- exceeding_nodes.sort!.each do |node|
- json = nodes.get node
- nodes_list << "- %-30s %14s AS%5s %2s\n" % [json['hostname'], node, json['asn'], json['countrycode']]
+ exceeding_nodes.sort_by{ |node| node[:cc] }.each do |node|
+ nodes_list << "- %-35s %15s AS%-6s %2s\n" % [node[:name], node[:ip], node[:as], node[:cc]]
end
mtr_list = ''
exceeding_nodes.sample(3).each do |node|
- json = nodes.get node
- mtr_list << "%-30s AS%5s (%2s)\n" % [json['hostname'], json['asn'], json['countrycode']]
- mtr_list << MTR.run(node)
+ mtr_list << "%-35s AS%-6s (%2s)\n" % [node[:name], node[:as], node[:cc]]
+ mtr_list << MTR.run(node[:ip])
mtr_list << "\n"
end
buffer_list = ''
time = alarm_buffer.array.size-1
alarm_buffer.array.each do |ary|
buffer_list << "%2s min ago %3s measurements failed" % [time, ary.size/2]
- type = time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n"
- buffer_list << type
+ buffer_list << (time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n")
time -= 1
end
msg[:long] = <<EOF
-This is an automated alert from the distributed partial outage
+Regarding: #{hostname}
+
+This is an automated alert from the distributed partial outage
monitoring system "RING SQA".
-At #{Time.now.utc} the following measurements were analysed
-as indicating that there is a high probability your NLNOG RING node
-cannot reach the entire internet. Possible causes could be an outage
+At #{Time.now.utc} the following measurements were analysed
+as indicating that there is a high probability your NLNOG RING node
+cannot reach the entire internet. Possible causes could be an outage
in your upstream's or peer's network.
-The following nodes previously were reachable, but became unreachable
+The following #{exceeding_nodes.size} nodes previously were reachable, but became unreachable
over the course of the last 3 minutes:
#{nodes_list}
-As a debug starting point 3 traceroutes were launched right after
+As a debug starting point 3 traceroutes were launched right after
detecting the event, they might assist in pinpointing what broke:
#{mtr_list}
-An alarm is raised under the following conditions: every 30 seconds
-your node pings all other nodes. The amount of nodes that cannot be
-reached is stored in a circular buffer, with each element representing
-a minute of measurements. In the event that the last three minutes are
-#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial
+An alarm is raised under the following conditions: every 30 seconds
+your node pings all other nodes. The amount of nodes that cannot be
+reached is stored in a circular buffer, with each element representing
+a minute of measurements. In the event that the last three minutes are
+#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial
outage is assumed. The ring buffer's output is as following:
#{buffer_list}
Kind regards,