#!/usr/bin/env ruby

# Reports the status, CPU and memory use of each Docker container, plus the
# disk use of the machine this runs on, to riemann.

require 'riemann/tools'

class Riemann::Tools::DockerHealth
  require 'docker'
  require 'socket'
  require 'time' # Time.parse
  require 'date' # DateTime.rfc3339
  require 'json' # JSON.generate
  include Riemann::Tools
  include Docker

  opt :docker_host, "Docker Container Host (see https://github.com/swipely/docker-api#host)", :default => nil
  opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
  opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
  opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
  opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
  opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
  opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
  opt :host_hostname, "Suffix of host", :type => String, :default => nil
  opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'memory', 'disk', 'basic']

  # Returns the containers listed by the Docker API.
  def get_containers
    Docker::Container.all
  end

  # Returns the container's 'Name' with its first character dropped
  # (presumably the leading '/' Docker puts in front of names -- confirm).
  def get_container_name(container)
    container.json['Name'][1..-1]
  end

  # Reads the command line options: Docker endpoint, host name used in
  # events, alert thresholds and the set of enabled checks.
  def initialize
    Docker.url = opts[:docker_host] unless opts[:docker_host].nil?

    # Fall back to the local host name when no usable one was supplied.
    @hostname = opts[:host_hostname]
    @hostname = Socket.gethostname if !@hostname.is_a?(String) || @hostname.empty?

    # Divisor applied to the CPU usage rate in #cpu; presumably nanoseconds
    # per second, matching the unit of Docker's total_usage -- confirm.
    @cpu_coefficient = 1000 * 1000 * 1000

    @limits = {
      :cpu => {:critical => opts[:cpu_critical], :warning => opts[:cpu_warning]},
      :disk => {:critical => opts[:disk_critical], :warning => opts[:disk_warning]},
      :memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]}
    }

    # Previous samples, keyed by container id.
    @last_cpu_reads = {}
    @last_uptime_reads = {}

    opts[:checks].each do |check|
      case check
      when 'disk'
        @disk_enabled = true
      when 'cpu'
        @cpu_enabled = true
      when 'memory'
        @memory_enabled = true
      when 'basic'
        @basic_inspection_enabled = true
      end
    end
  end

  # Sends one event to riemann.
  #
  # container   - container name appended to the host name, or nil to report
  #               against the bare host name.
  # service     - service name (converted with to_s).
  # state       - event state (converted with to_s).
  # metric      - numeric metric (converted with to_f, so nil becomes 0.0).
  # description - free-form description.
  def alert(container, service, state, metric, description)
    # Named `event` rather than `opts` so it does not shadow the options accessor.
    event = {
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    }
    event[:host] = container.nil? ? @hostname : "#{@hostname}-#{container}"

    report(event)
  end

  # Reports a fraction, choosing the state from the thresholds configured
  # for `service` in @limits. Does nothing when fraction is nil/false.
  #
  # container - container name, or nil for the host itself.
  # service   - key into @limits (:cpu, :disk or :memory).
  # fraction  - value to report; also rendered as a percentage.
  # report    - text appended to the percentage in the description.
  # name      - service name to report under; defaults to `service`.
  def report_pct(container, service, fraction, report = '', name = nil)
    return unless fraction

    name = service if name.nil?
    limits = @limits[service]
    state =
      if fraction > limits[:critical]
        :critical
      elsif fraction > limits[:warning]
        :warning
      else
        :ok
      end

    alert container, name, state, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
  end

  # Reports the CPU use of a container as the change in per-CPU usage since
  # the previous sample, divided by the elapsed time and @cpu_coefficient.
  # The first sample for a container only records a baseline.
  #
  # Returns false (after an :unknown alert) when the stats hold no usable
  # usage figures.
  def cpu(id, name, stats)
    precpu = stats['precpu_stats'] || {}
    usage = precpu['cpu_usage'] || {}
    total = usage['total_usage']

    # percpu_usage can be absent or empty; fall back to online_cpus instead
    # of raising on nil or dividing by zero.
    per_cpu = usage['percpu_usage']
    cpu_count = per_cpu.count if per_cpu && !per_cpu.empty?
    cpu_count ||= precpu['online_cpus']

    unless total && cpu_count && cpu_count > 0
      alert name, :cpu, :unknown, nil, 'no total usage found in docker remote api stats'
      return false
    end

    current = total / cpu_count
    current_time = Time.parse(stats['read'])

    last = @last_cpu_reads[id]
    if last
      elapsed = current_time - last[:t]
      # Two samples with the same timestamp would divide by zero.
      if elapsed > 0
        used = (current - last[:v]) / elapsed / @cpu_coefficient
        report_pct name, :cpu, used
      end
    end

    @last_cpu_reads[id] = { v: current, t: current_time }
  end

  # Reports memory use of a container as usage / limit.
  #
  # Returns false (after an :unknown alert) when no positive limit is
  # present, since the fraction would otherwise be NaN or Infinity.
  def memory(id, name, stats)
    memory_stats = stats['memory_stats'] || {}
    usage = memory_stats['usage'].to_f
    total = memory_stats['limit'].to_f

    unless total > 0
      alert name, :memory, :unknown, nil, 'no memory limit found in docker remote api stats'
      return false
    end

    report_pct name, :memory, usage / total, "#{usage} / #{total}"
  end

  # Reports the used fraction of each filesystem listed by `df -P`, against
  # the host (no container).
  def disk
    `df -P`.split(/\n/).each do |row|
      fields = row.split(/\s+/)
      next if fields[0] == 'Filesystem'
      next unless fields[0] =~ /\// # First column must contain a slash

      # Fifth column is the capacity percentage, e.g. "42%"
      used = fields[4].to_f / 100

      report_pct(nil, :disk, used, "#{fields[3].to_i / 1024} mb left", "disk #{fields[5]}")
    end
  end

  # Reports whether the container is running ("status") and, from the second
  # sample on, its uptime; a changed 'StartedAt' since the previous sample is
  # reported as critical.
  def basic_inspection(id, name, inspection)
    state = inspection['State']
    json_state = JSON.generate(state)

    running = state['Running']

    alert(name, "status",
          running ? "ok" : "critical",
          running ? 1 : 0,
          json_state)

    return unless running

    start_time = DateTime.rfc3339(state['StartedAt']).to_time.utc.to_i
    now = Time.now.to_i
    uptime = now - start_time

    last = @last_uptime_reads[id]
    unless last.nil?
      restarted = start_time != last
      alert(name, "uptime",
            restarted ? "critical" : "ok",
            uptime,
            "last 'StartedAt' measure was #{last} (#{Time.at(last).utc.to_s}), " +
            "now it's #{start_time} (#{Time.at(start_time).utc.to_s})")
    end

    @last_uptime_reads[id] = start_time
  end

  # Runs the enabled checks once: disk for the host, then the per-container
  # checks with one thread per container.
  def tick
    # Disk is the same in every container
    disk if @disk_enabled

    # Get CPU, memory and basic status of each container
    threads = get_containers.map do |ctr|
      Thread.new(ctr) do |container|
        id = container.id
        name = get_container_name(container)

        stats = Docker::Util.parse_json(container.connection.get("/containers/#{id}/stats", {stream:false}))

        if @basic_inspection_enabled
          inspection = Docker::Util.parse_json(container.connection.get("/containers/#{id}/json"))
          basic_inspection(id, name, inspection)
        end
        cpu(id, name, stats) if @cpu_enabled
        memory(id, name, stats) if @memory_enabled
      end
    end

    # join re-raises whatever a thread died with; log it and keep going so
    # one failing container does not stop the others from being collected.
    threads.each do |thread|
      begin
        thread.join
      rescue => e
        $stderr.puts "#{e.class} #{e}\n#{e.backtrace.join "\n"}"
      end
    end
  end
end

Riemann::Tools::DockerHealth.run