#!/usr/bin/env ruby

# Forwards information on a Riak node to Riemann.

require File.expand_path('../../lib/riemann/tools', __FILE__)

require 'net/http'
require 'net/https'
require 'yajl/json_gem'

# Riemann collector for a single Riak node.
#
# Each tick reports the node's stats (throughput and FSM latency percentiles),
# ring readiness and on-disk size. Stats are read over HTTP(S) when the stats
# endpoint answered at startup, otherwise from `riak-admin status`.
#
# `opt`, `opts`, `report` and `run` are not defined in this file; they are
# expected to come from the Riemann::Tools mixin.
class Riemann::Tools::Riak
  include Riemann::Tools

  opt :riak_host, "Riak host for stats or SSL http(s)://", :default => Socket.gethostname
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
  opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
  opt :cookie, "Riak cookie to use", :default => "riak"

  opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
  opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
  opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
  opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
  opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
  opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000

  # Probes once, at startup, which data sources are usable:
  #   @escript    - `escript` is on the PATH
  #   @riakadmin  - `riak-admin` is on the PATH
  #   @httpstatus - the HTTP(S) stats endpoint could be fetched without error
  def initialize
    @escript    = tool_available?('escript')
    @riakadmin  = tool_available?('riak-admin')
    @httpstatus = stats_endpoint_reachable?

    # we're going to override the emulator setting to allow users to
    # dynamically input the cookie
    # this is done only once - hopefully it doesn't get overridden.
    ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
  end

  # Reports 'riak ring' as ok when the ring check output starts with TRUE,
  # warning otherwise. Prefers the bundled riemann-riak-ring helper (needs
  # escript) and falls back to `riak-admin ringready`. When neither tool is
  # available the description is nil and the state is warning.
  def check_ring
    str = if @escript
            run_helper('riemann-riak-ring')
          elsif @riakadmin
            `riak-admin ringready`
          end

    report(
      :host => opts[:riak_host],
      :service => 'riak ring',
      :state => (str =~ /^TRUE/ ? 'ok' : 'warning'),
      :description => str
    )
  end

  # Reports the key count printed by the bundled riemann-riak-keys helper.
  # Purely numeric output becomes the metric with state ok; anything else is
  # passed through as the description with state unknown.
  def check_keys
    keys = run_helper('riemann-riak-keys')

    if keys =~ /^\d+$/
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'ok',
        :metric => keys.to_i,
        :description => keys
      )
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'unknown',
        :description => keys
      )
    end
  end

  # Reports the size of the data directory in GB, as measured by `du`.
  def check_disk
    # -k pins du's unit to 1 KiB blocks; plain `du -s` uses a platform- and
    # environment-dependent block size, which would skew the KiB -> GiB
    # division below.
    gb = `du -sk #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)

    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'ok',
      :metric => gb,
      :description => "#{gb} GB in #{opts[:data_dir]}"
    )
  end

  # Returns the riak stat for the given fsm type and percentile.
  # The 50th percentile is published under the name 'median'.
  def fsm_stat(type, percentile)
    "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}"
  end

  # Returns the alerts state for the given fsm.
  #
  # val is compared against the configured "<type>_<percentile>_warning"
  # threshold: within 0..limit is ok, up to twice the limit is warning, and
  # anything else (including negative values) is critical.
  def fsm_state(type, percentile, val)
    limit = opts["#{type}_#{percentile}_warning".to_sym]

    case val
    when 0 .. limit
      'ok'
    when limit .. limit * 2
      'warning'
    else
      'critical'
    end
  end

  # Collects node stats and reports overall health, per-second throughput
  # counters and FSM latency percentiles. When stats cannot be obtained a
  # single critical 'riak' event is reported and nothing else is sent.
  def check_stats
    stats = collect_stats
    return unless stats

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'ok'
    )

    # Gets/puts/rr
    [
      'vnode_gets',
      'vnode_puts',
      'node_gets',
      'node_puts',
      'read_repairs'
    ].each do |s|
      # The raw value is divided by 60 to express it per second.
      rate = stats[s].to_i / 60.0
      report(
        :host => opts[:riak_host],
        :service => "riak #{s}",
        :state => 'ok',
        :metric => rate,
        :description => "#{rate}/sec"
      )
    end

    # FSMs
    ['get', 'put'].each do |type|
      [50, 95, 99].each do |percentile|
        # to_i turns both a missing stat (nil) and the literal string
        # 'undefined' into 0, so no separate guard is needed.
        val = stats[fsm_stat(type, percentile)].to_i / 1000.0 # Convert us to ms
        report(
          :host => opts[:riak_host],
          :service => "riak #{type} #{percentile}",
          :state => fsm_state(type, percentile, val),
          :metric => val,
          :description => "#{val} ms"
        )
      end
    end
  end

  # Runs every check except check_keys.
  def tick
    # This can utterly destroy a cluster, so we disable
    # check_keys
    check_stats
    check_ring
    check_disk
  end

  private

  # True when `which` prints a path for the given executable name.
  def tool_available?(name)
    !`which #{name}`.strip.empty?
  end

  # Runs a helper script that lives next to this file, passing the configured
  # erlang node name, and returns its output without the trailing newline.
  def run_helper(script)
    `#{File.expand_path(File.dirname(__FILE__))}/#{script} #{opts[:node_name]}`.chomp
  end

  # Performs a GET of the stats path against the configured host and port and
  # returns the Net::HTTP response. opts[:riak_host] may be a bare hostname or
  # an http(s):// URL; TLS is used only for the https scheme. Connection and
  # protocol errors propagate to the caller.
  def fetch_stats
    uri = URI.parse(opts[:riak_host])
    # A bare hostname parses with no host component; use the option as-is.
    host = uri.host || opts[:riak_host]

    http = Net::HTTP.new(host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for https hosts.
    # This preserves existing behaviour; confirm it is intentional.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

    http.start { |conn| conn.get opts[:stats_path] }
  end

  # True when the stats endpoint can be fetched without raising. The HTTP
  # status code is not inspected here.
  def stats_endpoint_reachable?
    fetch_stats
    true
  rescue
    false
  end

  # Returns a Hash of stats from the best available source, or nil after
  # reporting a critical 'riak' event when no source yields stats.
  def collect_stats
    if @httpstatus
      stats_from_http
    elsif @riakadmin
      parse_admin_status(`riak-admin status`)
    else
      report_stats_failure("error fetching Riak stats")
    end
  end

  # Fetches and JSON-decodes the stats endpoint. Returns nil (after reporting
  # a critical event) on a request error or a non-200 response.
  def stats_from_http
    begin
      res = fetch_stats
    rescue => e
      return report_stats_failure(
        "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
      )
    end

    return JSON.parse(res.body) if res.code.to_i == 200

    report_stats_failure("stats returned HTTP #{res.code}:\n\n#{res.body}")
  end

  # Turns "name : value" lines into a Hash of strings. Lines without the
  # " : " separator (headers, rulers, blank lines) are skipped, and only the
  # first separator splits, so values containing " : " stay intact. Feeding
  # such lines to Hash[] unfiltered can raise ArgumentError.
  def parse_admin_status(output)
    pairs = output.split(/\n/).map { |line| line.split(/ : /, 2) }
    Hash[pairs.select { |pair| pair.size == 2 }]
  end

  # Reports a critical 'riak' event with the given description and returns
  # nil so callers can use it as their "no stats" result.
  def report_stats_failure(description)
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => description
    )
    nil
  end
end

Riemann::Tools::Riak.run