bin/riemann-proc in riemann-tools-0.2.8 vs bin/riemann-proc in riemann-tools-0.2.9
- old
+ new
@@ -5,13 +5,13 @@
require File.expand_path('../../lib/riemann/tools', __FILE__)
class Riemann::Tools::Proc
include Riemann::Tools
- opt :proc_regex, "regular expression that matches the process to be monitored", type: :string
- opt :proc_min_critical, "running process count minimum", :default => 1
- opt :proc_max_critical, "running process count maximum", :default => 1
+ opt :proc_regex, "regular expression that matches the process to be monitored", type: :string, :default => ".*"
+ opt :proc_min_critical, "running process count minimum", :default => 0
+ opt :proc_max_critical, "running process count maximum", :default => 65536
def initialize
@limits = { :critical => { :min => opts[:proc_min_critical], :max => opts[:proc_max_critical] } }
abort "FATAL: specify a process regular expression, see --help for usage" unless opts[:proc_regex]
@@ -30,15 +30,74 @@
)
end
def linux_proc
process = opts[:proc_regex]
- found = `ps axo args | grep '#{process}' | grep -v grep | grep -v riemann-proc`
+ found = `ps axo pid=,rss=,vsize=,state=,cputime=,lstart=,command= | grep '#{process}' | grep -v grep | grep -v riemann-proc`
running = found.count("\n")
if running > @limits[:critical][:max] or running < @limits[:critical][:min]
- alert "proc #{process}", :critical, running, "process #{process} is running #{running} instances:\n" + found
+ alert "proc count/#{process}", :critical, running, "process #{process} is running #{running} instances.\n"
else
- alert "proc #{process}", :ok, running, "process #{process} is running #{running} instances:\n" + found
+ alert "proc count/#{process}", :ok, running, "process #{process} is running #{running} instances.\n"
+ end
+ # For every ps line that matches ps_regex, report four events whose services are:
+ #
+ # proc <pid>-<start-time>/rss (metric: rss column as a float)
+ # proc <pid>-<start-time>/vsize (metric: vsize column as a float)
+ # proc <pid>-<start-time>/running (metric: 1.0 when the ps state is R, otherwise 0.0)
+ # proc <pid>-<start-time>/cputime (metric: cputime converted to seconds)
+ #
+ # <start-time> is the lstart value parsed to an integer via to_time.to_i; description is the command.
+ # Event state comes from the first ps state letter: R/S -> ok, I -> warning, T/U/Z -> critical,
+ # anything else -> unknown. Lines that ps_regex does not match are skipped without reporting.
+ # NOTE(review): the regex expects one space before the day and no "-" in cputime - confirm against ps output.
+ ps_regex = /([0-9]+)[ ]+([0-9]+)[ ]+([0-9]+)[ ]+([A-Z])[ ]+([0-9:.]+)[ ]+[A-Za-z]{3}[ ]+([A-Za-z]{3} [0-9]+ [0-9:]+ [0-9]+)[ ]+(.*)/
+ found.each_line do |line|
+ m = ps_regex.match(line)
+ if not m.nil?
+ pid, rss, vsize, state, cputime, start, command = m.captures
+ start_s = DateTime.parse(start, "Mmm DD HH:MM:ss YYYY").to_time.to_i
+ cputime_s = DateTime.parse(cputime, "%H:%M:%S")
+ cputime_seconds = (cputime_s.hour * 3600) + (cputime_s.minute * 60) + cputime_s.second
+ running = 0
+ case state[0]
+ when "R"
+ state_s = "ok"
+ running = 1
+ when "S"
+ state_s = "ok"
+ when "I"
+ state_s = "warning"
+ when "T", "U", "Z"
+ state_s = "critical"
+ else
+ state_s = "unknown"
+ end
+ report(
+ :service => "proc #{pid}-#{start_s}/rss",
+ :state => state_s.to_s,
+ :metric => rss.to_f,
+ :description => command,
+ )
+ report(
+ :service => "proc #{pid}-#{start_s}/vsize",
+ :state => state_s.to_s,
+ :metric => vsize.to_f,
+ :description => command,
+ )
+ report(
+ :service => "proc #{pid}-#{start_s}/running",
+ :state => state_s.to_s,
+ :metric => running.to_f,
+ :description => command,
+ )
+ report(
+ :service => "proc #{pid}-#{start_s}/cputime",
+ :state => state_s.to_s,
+ :metric => cputime_seconds,
+ :description => command,
+ )
+ end
end
end
def tick
@check.call