tools/riemann-docker/bin/riemann-docker in riemann-tools-1.0.0 vs tools/riemann-docker/bin/riemann-docker in riemann-tools-1.1.0

- old
+ new

@@ -1,217 +1,206 @@ #!/usr/bin/env ruby -Process.setproctitle($0) +# frozen_string_literal: true +Process.setproctitle($PROGRAM_NAME) + # Reports current CPU, disk, load average, and memory use to riemann. require 'riemann/tools' -class Riemann::Tools::DockerHealth - require 'docker' - require 'socket' - include Riemann::Tools - include Docker +module Riemann + module Tools + class DockerHealth + require 'docker' + require 'socket' + include Riemann::Tools + include Docker - opt :docker_host, "Docker Container Host (see https://github.com/swipely/docker-api#host)", :type => String, :default => nil - opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9 - opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95 - opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9 - opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95 - opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85 - opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95 - opt :host_hostname, "Suffix of host", :type => String, :default => nil - opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'memory', 'disk', 'basic'] + opt :docker_host, 'Docker Container Host (see https://github.com/swipely/docker-api#host)', type: String, + default: nil + opt :cpu_warning, 'CPU warning threshold (fraction of total jiffies)', default: 0.9 + opt :cpu_critical, 'CPU critical threshold (fraction of total jiffies)', default: 0.95 + opt :disk_warning, 'Disk warning threshold (fraction of space used)', default: 0.9 + opt :disk_critical, 'Disk critical threshold (fraction of space used)', default: 0.95 + opt :memory_warning, 'Memory warning threshold (fraction of RAM)', default: 0.85 + opt :memory_critical, 'Memory critical threshold (fraction of RAM)', default: 0.95 + opt :host_hostname, 'Suffix of host', type: String, default: nil + opt :checks, 'A list of checks to run.', type: :strings, default: %w[cpu memory disk basic] - def get_containers - Docker::Container.all - end + def containers + Docker::Container.all + end - def get_container_name(container) - container.json['Name'][1..-1] - end + def get_container_name(container) + container.json['Name'][1..] + end - def initialize + def initialize + Docker.url = opts[:docker_host] unless opts[:docker_host].nil? - if (opts[:docker_host] != nil) - Docker.url = opts[:docker_host] - end + @hostname = opts[:host_hostname] + @hostname = Socket.gethostname if @hostname.nil? || !(@hostname.is_a? String) || @hostname.empty? - @hostname = opts[:host_hostname] - if (@hostname.nil? || !(@hostname.is_a? String) || @hostname.empty?) - @hostname = Socket.gethostname - end + @cpu_coefficient = 1000 * 1000 * 1000 - @cpu_coefficient = 1000 * 1000 * 1000 + @limits = { + cpu: { critical: opts[:cpu_critical], warning: opts[:cpu_warning] }, + disk: { critical: opts[:disk_critical], warning: opts[:disk_warning] }, + memory: { critical: opts[:memory_critical], warning: opts[:memory_warning] }, + } - @limits = { - :cpu => {:critical => opts[:cpu_critical], :warning => opts[:cpu_warning]}, - :disk => {:critical => opts[:disk_critical], :warning => opts[:disk_warning]}, - :memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]} - } + @last_cpu_reads = {} + @last_uptime_reads = {} - @last_cpu_reads = Hash.new - @last_uptime_reads = Hash.new - - opts[:checks].each do |check| - case check - when 'disk' - @disk_enabled = true - when 'cpu' - @cpu_enabled = true - when 'memory' - @memory_enabled = true - when 'basic' - @basic_inspection_enabled = true + opts[:checks].each do |check| + case check + when 'disk' + @disk_enabled = true + when 'cpu' + @cpu_enabled = true + when 'memory' + @memory_enabled = true + when 'basic' + @basic_inspection_enabled = true + end + end end - end - end - def alert(container, service, state, metric, description) + def alert(container, service, state, metric, description) + opts = { + service: service.to_s, + state: state.to_s, + metric: metric.to_f, + description: description, + } - opts = { :service => service.to_s, - :state => state.to_s, - :metric => metric.to_f, - :description => description } + opts[:host] = if !container.nil? + "#{@hostname}-#{container}" + else + @hostname + end - if (container != nil) - opts[:host] = "#{@hostname}-#{container}" - else - opts[:host] = @hostname - end + report(opts) + end - report(opts) - end + def report_pct(container, service, fraction, report = '', name = nil) + return unless fraction - def report_pct(container, service, fraction, report = '', name = nil) - if fraction + name = service if name.nil? - if (name == nil) - name = service + if fraction > @limits[service][:critical] + alert container, name, :critical, fraction, "#{format('%.2f', fraction * 100)}% #{report}" + elsif fraction > @limits[service][:warning] + alert container, name, :warning, fraction, "#{format('%.2f', fraction * 100)}% #{report}" + else + alert container, name, :ok, fraction, "#{format('%.2f', fraction * 100)}% #{report}" + end end - if fraction > @limits[service][:critical] - alert container, name, :critical, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - elsif fraction > @limits[service][:warning] - alert container, name, :warning, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - else - alert container, name, :ok, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - end - end - end + def cpu(id, name, stats) + current = stats['precpu_stats']['cpu_usage']['total_usage'] / stats['precpu_stats']['cpu_usage']['percpu_usage'].count + unless current + alert name, :cpu, :unknown, nil, 'no total usage found in docker remote api stats' + return false + end - def cpu(id, name, stats) + current_time = Time.parse(stats['read']) + unless @last_cpu_reads[id].nil? + last = @last_cpu_reads[id] + used = (current - last[:v]) / (current_time - last[:t]) / @cpu_coefficient - current = stats['precpu_stats']['cpu_usage']['total_usage'] / stats['precpu_stats']['cpu_usage']['percpu_usage'].count + report_pct name, :cpu, used + end - unless current - alert name, :cpu, :unknown, nil, 'no total usage found in docker remote api stats' - return false - end + @last_cpu_reads[id] = { v: current, t: current_time } + end - current_time = Time.parse(stats['read']); - if (@last_cpu_reads[id] != nil) - last = @last_cpu_reads[id] - used = (current - last[:v]) / (current_time - last[:t]) / @cpu_coefficient + def memory(_id, name, stats) + memory_stats = stats['memory_stats'] + usage = memory_stats['usage'].to_f + total = memory_stats['limit'].to_f + fraction = (usage / total) - report_pct name, :cpu, used - end + report_pct name, :memory, fraction, "#{usage} / #{total}" + end - @last_cpu_reads[id] = { v: current, t: current_time } - end + def disk + `df -P`.split(/\n/).each do |r| + f = r.split(/\s+/) + next if f[0] == 'Filesystem' + next unless f[0] =~ %r{/} # Needs at least one slash in the mount path - def memory(id, name, stats) - memory_stats = stats['memory_stats'] - usage = memory_stats['usage'].to_f - total = memory_stats['limit'].to_f - fraction = (usage / total) + # Calculate capacity + x = f[4].to_f / 100 + report_pct(nil, :disk, x, "#{f[3].to_i / 1024} mb left", "disk #{f[5]}") + end + end - report_pct name, :memory, fraction, "#{usage} / #{total}" - end + def basic_inspection(id, name, inspection) + state = inspection['State'] + json_state = JSON.generate(state) - def disk - `df -P`.split(/\n/).each do |r| - f = r.split(/\s+/) - next if f[0] == 'Filesystem' - next unless f[0] =~ /\// # Needs at least one slash in the mount path + running = state['Running'] - # Calculate capacity - x = f[4].to_f/100 - report_pct(nil, :disk, x, "#{f[3].to_i / 1024} mb left", "disk #{f[5]}") - end - end + alert( + name, 'status', + running ? 'ok' : 'critical', + running ? 1 : 0, + json_state, + ) - def basic_inspection(id, name, inspection) + return unless running - state = inspection['State'] - json_state = JSON.generate(state) + start_time = DateTime.rfc3339(state['StartedAt']).to_time.utc.to_i + now = DateTime.now.to_time.utc.to_i + uptime = now - start_time - running = state['Running'] + unless @last_uptime_reads[id].nil? + last = @last_uptime_reads[id] + restarted = start_time != last + alert( + name, 'uptime', + restarted ? 'critical' : 'ok', + uptime, + "last 'StartedAt' measure was #{last} (#{Time.at(last).utc}), " \ + "now it's #{start_time} (#{Time.at(start_time).utc})", + ) + end - alert(name, "status", - running ? "ok" : "critical", - running ? 1 : 0, - json_state) - - if (running) - start_time = DateTime.rfc3339(state['StartedAt']).to_time.utc.to_i - now = DateTime.now.to_time.utc.to_i - uptime = now - start_time - - if (@last_uptime_reads[id] != nil) - last = @last_uptime_reads[id] - restarted = start_time != last - alert(name, "uptime", - restarted ? "critical" : "ok", - uptime, - "last 'StartedAt' measure was #{last} (#{Time.at(last).utc.to_s}), " + - "now it's #{start_time} (#{Time.at(start_time).utc.to_s})") + @last_uptime_reads[id] = start_time end - @last_uptime_reads[id] = start_time - end - end + def tick + # Disk is the same in every container + disk if @disk_enabled - def tick + # Get CPU, Memory and Load of each container + threads = [] - # Disk is the same in every container - if @disk_enabled - disk() - end + containers.each do |ctr| + threads << Thread.new(ctr) do |container| + id = container.id + name = get_container_name(container) - # Get CPU, Memory and Load of each container - containers = get_containers() - threads = [] + stats = Docker::Util.parse_json(container.connection.get("/containers/#{id}/stats", { stream: false })) - containers.each do |ctr| - threads << Thread.new(ctr) do |container| - - id = container.id - name = get_container_name(container) - - stats = Docker::Util.parse_json(container.connection.get("/containers/#{id}/stats", {stream:false})) - - if @basic_inspection_enabled - inspection = Docker::Util.parse_json(container.connection.get("/containers/#{id}/json")) - basic_inspection(id, name, inspection) + if @basic_inspection_enabled + inspection = Docker::Util.parse_json(container.connection.get("/containers/#{id}/json")) + basic_inspection(id, name, inspection) + end + cpu(id, name, stats) if @cpu_enabled + memory(id, name, stats) if @memory_enabled end - if @cpu_enabled - cpu(id, name, stats) end - if @memory_enabled - memory(id, name, stats) + + threads.each do |thread| + thread.join + rescue StandardError => e + warn "#{e.class} #{e}\n#{e.backtrace.join "\n"}" end end end - - threads.each do |thread| - begin - thread.join - rescue => e - $stderr.puts "#{e.class} #{e}\n#{e.backtrace.join "\n"}" - end - end end end Riemann::Tools::DockerHealth.run -