lib/ring/sqa/alarm.rb in ring-sqa-0.0.19 vs lib/ring/sqa/alarm.rb in ring-sqa-0.0.20

- old
+ new

@@ -1,11 +1,11 @@ require_relative 'alarm/email' require_relative 'alarm/udp2irc' require_relative 'alarm/cfg' +require_relative 'alarm/message' require_relative 'mtr' require_relative 'paste' -require_relative 'nodes_json' module Ring class SQA class Alarm @@ -28,23 +28,23 @@ end end private - def initialize + def initialize nodes + @nodes = nodes @methods = [] @methods << Email.new if CFG.email.to? @methods << UDP2IRC.new if CFG.irc.password? + @hostname = Ring::SQA::CFG.host.name @alarm = false - @hostname = (Socket.gethostname rescue 'anonymous') end def compose_message alarm_buffer exceeding_nodes = alarm_buffer.exceeding_nodes msg = {short: "#{@hostname}: raising alarm - #{exceeding_nodes.size} new nodes down"} - nodes_json = NodesJSON.new - exceeding_nodes = exceeding_nodes.map { |node| nodes_json.get node } + exceeding_nodes = exceeding_nodes.map { |node| @nodes.get node } nodes_list = '' exceeding_nodes.sort_by{ |node| node[:cc] }.each do |node| nodes_list << "- %-35s %15s AS%-6s %2s\n" % [node[:name], node[:ip], node[:as], node[:cc]] end @@ -62,43 +62,10 @@ buffer_list << "%2s min ago %3s measurements failed" % [time, ary.size/2] buffer_list << (time.to_i < 3 ? " (raised alarm)\n" : " (baseline)\n") time -= 1 end - msg[:long] = <<EOF -Regarding: #{hostname} - -This is an automated alert from the distributed partial outage -monitoring system "RING SQA". - -At #{Time.now.utc} the following measurements were analysed -as indicating that there is a high probability your NLNOG RING node -cannot reach the entire internet. Possible causes could be an outage -in your upstream's or peer's network. - -The following #{exceeding_nodes.size} nodes previously were reachable, but became unreachable -over the course of the last 3 minutes: - -#{nodes_list} - -As a debug starting point 3 traceroutes were launched right after -detecting the event, they might assist in pinpointing what broke: - -#{mtr_list} - -An alarm is raised under the following conditions: every 30 seconds -your node pings all other nodes. The amount of nodes that cannot be -reached is stored in a circular buffer, with each element representing -a minute of measurements. In the event that the last three minutes are -#{Ring::SQA::CFG.analyzer.tolerance} above the median of the previous 27 measurement slots, a partial -outage is assumed. The ring buffer's output is as following: - -#{buffer_list} - -Kind regards, - -NLNOG RING -EOF + msg[:long] = message nodes_list, mtr_list, buffer_list msg end end