#!/usr/bin/env ruby

# Reports running process count to riemann.

require 'shellwords'
require File.expand_path('../../lib/riemann/tools', __FILE__)

# Counts the processes whose `ps` line matches --proc-regex and reports:
#
#   * one "proc count/<regex>" event whose state is :critical when the count
#     falls outside [--proc-min-critical, --proc-max-critical], :ok otherwise;
#   * four events per matched process (rss, vsize, running, cputime).
#
# `opt`, `opts`, `report` and `run` are not defined in this file; they are
# presumably provided by Riemann::Tools (verify against lib/riemann/tools).
class Riemann::Tools::Proc
  include Riemann::Tools

  opt :proc_regex, "regular expression that matches the process to be monitored", type: :string, :default => ".*"
  opt :proc_min_critical, "running process count minimum", :default => 0
  opt :proc_max_critical, "running process count maximum", :default => 65536

  # Column list handed to ps; the trailing "=" on each keyword suppresses the
  # header line so every output line describes one process.
  PS_COMMAND = "ps axo pid=,rss=,vsize=,state=,cputime=,lstart=,command=".freeze

  # One line of PS_COMMAND output.
  #
  # * cputime accepts an optional "DD-" prefix (procps prints "[DD-]hh:mm:ss").
  # * lstart is ctime-style ("Fri Jan  5 10:11:12 2024"); single-digit days are
  #   padded with an extra space, hence "[ ]+" between month and day.
  # * The leading weekday of lstart is matched but not captured.
  PS_LINE = /
    (?<pid>[0-9]+)[ ]+
    (?<rss>[0-9]+)[ ]+
    (?<vsize>[0-9]+)[ ]+
    (?<state>[A-Z])[ ]+
    (?<cputime>(?:[0-9]+-)?[0-9:.]+)[ ]+
    [A-Za-z]{3}[ ]+
    (?<month>[A-Za-z]{3})[ ]+(?<day>[0-9]+)[ ]+
    (?<hour>[0-9]+):(?<min>[0-9]+):(?<sec>[0-9]+)[ ]+
    (?<year>[0-9]+)[ ]+
    (?<command>.*)
  /x.freeze

  # ps state letter => riemann state. Letters not listed map to "unknown".
  PROCESS_STATES = {
    "R" => "ok",
    "S" => "ok",
    "I" => "warning",
    "T" => "critical",
    "U" => "critical",
    "Z" => "critical"
  }.freeze

  # Reads the critical thresholds from the command-line options and selects
  # the check implementation. Only a Linux implementation exists, so it is
  # used on every OS (with a warning on non-Linux hosts).
  def initialize
    @limits = { :critical => { :min => opts[:proc_min_critical], :max => opts[:proc_max_critical] } }

    abort "FATAL: specify a process regular expression, see --help for usage" unless opts[:proc_regex]

    ostype = `uname -s`.chomp.downcase
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"
    @check = method :linux_proc
  end

  # Sends one event, coercing service/state to String and metric to Float.
  def alert(service, state, metric, description)
    report(
      :service     => service.to_s,
      :state       => state.to_s,
      :metric      => metric.to_f,
      :description => description
    )
  end

  # Runs ps, reports the number of matching processes, then reports the
  # per-process metrics for every line that parses.
  def linux_proc
    process = opts[:proc_regex]
    # The pattern is shell-escaped so quotes or other metacharacters in it
    # cannot break (or inject into) the pipeline; "-e" keeps a pattern that
    # starts with "-" from being read as a grep option.
    found = `#{PS_COMMAND} | grep -e #{Shellwords.escape(process)} | grep -v grep | grep -v riemann-proc`
    running = found.count("\n")

    within_limits = running.between?(@limits[:critical][:min], @limits[:critical][:max])
    alert "proc count/#{process}", (within_limits ? :ok : :critical), running,
          "process #{process} is running #{running} instances.\n"

    found.each_line { |line| report_process(line) }
  end

  # Called periodically (presumably by the Riemann::Tools run loop).
  def tick
    @check.call
  end

  private

  # Reports the metrics of a single ps output line:
  #
  #   proc <pid>-<start>/rss      value of the rss column
  #   proc <pid>-<start>/vsize    value of the vsize column
  #   proc <pid>-<start>/running  1 when the state letter is "R", else 0
  #   proc <pid>-<start>/cputime  accumulated CPU time in whole seconds
  #
  # The description of every event is the process command line. Lines that do
  # not match PS_LINE are ignored.
  def report_process(line)
    m = PS_LINE.match(line)
    return if m.nil?

    # lstart carries no zone; the fields are read as UTC so that <start> is a
    # stable identifier independent of the collector's TZ setting.
    started_at = Time.utc(m[:year].to_i, m[:month], m[:day].to_i,
                          m[:hour].to_i, m[:min].to_i, m[:sec].to_i).to_i

    state = PROCESS_STATES.fetch(m[:state], "unknown")
    metrics = [
      ["rss",     m[:rss].to_f],
      ["vsize",   m[:vsize].to_f],
      ["running", m[:state] == "R" ? 1.0 : 0.0],
      ["cputime", cputime_to_seconds(m[:cputime])]
    ]

    metrics.each do |name, value|
      report(
        :service     => "proc #{m[:pid]}-#{started_at}/#{name}",
        :state       => state,
        :metric      => value,
        :description => m[:command]
      )
    end
  end

  # Converts a ps cputime value to whole seconds.
  #
  # An optional "DD-" prefix is taken as days; the remaining colon-separated
  # fields are folded base-60, so "hh:mm:ss" and "mm:ss" both work. Fractional
  # seconds are truncated.
  def cputime_to_seconds(cputime)
    days, _dash, clock = cputime.rpartition("-")
    clock_seconds = clock.split(":").reduce(0) { |total, field| total * 60 + field.to_f }
    (days.to_i * 86_400 + clock_seconds).to_i
  end
end

Riemann::Tools::Proc.run