lib/riemann/babbler/plugins/runit.rb in riemann-babbler-1.2.6 vs lib/riemann/babbler/plugins/runit.rb in riemann-babbler-1.2.7

- old
+ new

@@ -1,12 +1,13 @@ class Riemann::Babbler::Runit < Riemann::Babbler def init plugin.set_default(:service, 'runit') plugin.set_default(:not_monit, ['riemann-client']) - plugin.set_default(:uptime, 10) plugin.set_default(:interval, 60) + + @status_history = Array.new end def run_plugin Dir.exists? '/etc/service' end @@ -16,22 +17,47 @@ pid_file = File.join(service, 'supervise', 'pid') return 0 unless File.exist?(pid_file) Time.now.to_i - File.mtime(pid_file).to_i end + def runned?(service) + stat_file = File.join(service, 'supervise', 'stat') + return false unless File.exists?(stat_file) + File.read( stat_file ).strip == 'run' + end + + def human_srv(service) + service.gsub(/\/etc\/service\//, '') + end + + def not_monit?(service) + plugin.not_monit.include? human_srv(service) + end + def read_run_status status = Array.new Dir.glob('/etc/service/*').each do |srv| - human_srv = srv.gsub(/\/etc\/service\//, '') - next if plugin.not_monit.include? human_srv - stat_file = File.join(srv, 'supervise', 'stat') - next unless File.exists? 
stat_file + + next if not_monit?(srv) srv_uptime = uptime(srv) - if (File.read( stat_file ).strip == 'run') && (srv_uptime > plugin.uptime) - status << {:service => plugin.service + ' ' + human_srv , :state => 'ok', :description => "runit service #{human_srv} running", :metric => srv_uptime} + srv_runned = runned?(srv) + srv_name = human_srv(srv) + + # the service is running and has been up for longer than the time since our previous visit (one plugin.interval) + if srv_runned && srv_uptime > plugin.interval + @status_history.delete(srv_name) + status << {:service => plugin.service + ' ' + srv_name , :state => 'ok', :description => "runit service #{srv_name} running", :metric => srv_uptime} else - status << {:service => plugin.service + ' ' + human_srv , :state => 'critical', :description => "runit service #{human_srv} not running", :metric => srv_uptime} + # the service is running but its uptime is suspiciously short, and it was not flagged on the previous pass + if srv_uptime < plugin.interval && srv_runned && !@status_history.include?(srv_name) + # just remember it for now + @status_history << srv_name + else + # in every other case report a problem (NOTE(review): a running service whose uptime equals plugin.interval exactly also lands here and is reported as "not running" - confirm this is intended) + status << {:service => plugin.service + ' ' + srv_name , :state => 'critical', :description => "runit service #{srv_name} not running", :metric => srv_uptime} + end end + end status end def collect