#!/usr/bin/env ruby

# Forwards information on a Riak node to Riemann.

require File.expand_path('../../lib/riemann/tools', __FILE__)

require 'net/http'
require 'net/https'
require 'yajl/json_gem'

# Collects Riak node health (stats, ring readiness, disk usage) and reports
# each measurement as a Riemann event.
#
# `opt`, `opts`, `report` and `run` are not defined in this file; they are
# presumably supplied by Riemann::Tools -- confirm against lib/riemann/tools.
class Riemann::Tools::Riak
  include Riemann::Tools

  opt :riak_host, "Riak host for stats or SSL http(s)://", :default => Socket.gethostname
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
  opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
  opt :cookie, "Riak cookie to use", :default => "riak"
  opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
  opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
  opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
  opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
  opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
  opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000

  # Detects which local tools are available, probes the HTTP stats endpoint
  # once to decide whether stats can be fetched over HTTP, and exports the
  # Erlang cookie through the environment.
  def initialize
    detect_features

    # Only a raised error (unparsable host, refused connection, TLS failure,
    # ...) turns HTTP stats off; the status code of the probe response is
    # deliberately not inspected here.
    @httpstatus = true
    begin
      riak_stats_response
    rescue StandardError
      @httpstatus = false
    end

    # we're going to override the emulator setting to allow users to
    # dynamically input the cookie
    # this is done only once - hopefully it doesn't get overridden.
    ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
  end

  # Identifies whether escript and riak-admin are installed.
  # A tool counts as missing when `which` prints nothing but whitespace.
  def detect_features
    @escript = `which escript` !~ /^\s*$/        # Whether escript is present on this machine
    @riakadmin = `which riak-admin` !~ /^\s*$/   # Whether riak-admin is present
  end

  # Reports ring readiness as the 'riak ring' service.
  #
  # Prefers the riemann-riak-ring helper script that sits next to this file
  # (when escript is available) and falls back to `riak-admin ringready`.
  # Output starting a line with "TRUE" is 'ok'; anything else, including the
  # case where neither tool is available (description is then nil), is
  # reported as 'warning'.
  def check_ring
    if @escript
      str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      str = `riak-admin ringready`
    end

    report(
      :host => opts[:riak_host],
      :service => 'riak ring',
      :state => (str =~ /^TRUE/) ? 'ok' : 'warning',
      :description => str
    )
  end

  # Reports the key count printed by the riemann-riak-keys helper script as
  # the 'riak keys' service. Purely numeric output becomes the metric with
  # state 'ok'; any other output is passed through as the description with
  # state 'unknown'.
  def check_keys
    keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp

    event = {
      :host => opts[:riak_host],
      :service => 'riak keys',
      :description => keys
    }

    if keys =~ /^\d+$/
      report(event.merge(:state => 'ok', :metric => keys.to_i))
    else
      report(event.merge(:state => 'unknown'))
    end
  end

  # Reports the size of the Riak data directory, in GB, as 'riak disk'.
  #
  # `-k` forces du to count in 1024-byte units so that dividing by 1024**2
  # yields GB regardless of the platform's default du block size.
  # If du prints nothing, the metric is 0.0.
  def check_disk
    gb = `du -sk #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'ok',
      :metric => gb,
      :description => "#{gb} GB in #{opts[:data_dir]}"
    )
  end

  # Returns the riak stat for the given fsm type and percentile.
  # The 50th percentile is named 'median' in the stat key.
  def fsm_stat(type, percentile)
    suffix = percentile == 50 ? 'median' : percentile
    "node_#{type}_fsm_time_#{suffix}"
  end

  # Returns the alerts state for the given fsm.
  #
  # The limit is read from the matching `<type>_<percentile>_warning` option:
  # values from 0 up to the limit are 'ok', up to twice the limit 'warning',
  # and everything else 'critical'.
  def fsm_state(type, percentile, val)
    limit = opts[:"#{type}_#{percentile}_warning"]
    case val
    when 0..limit then 'ok'
    when limit..(limit * 2) then 'warning'
    else 'critical'
    end
  end

  # Get current stats via HTTP.
  #
  # Returns the parsed JSON body of a 200 response. On a transport error the
  # failure is reported as a critical 'riak' event and the original exception
  # is re-raised; on a non-200 response a critical event is reported and a
  # RuntimeError carrying the status code and body is raised.
  def stats_http
    begin
      res = riak_stats_response
    rescue => e
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
      )
      raise
    end

    return JSON.parse(res.body) if res.code.to_i == 200

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
    )
    raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
  end

  # Get current stats via riak-admin.
  #
  # Parses `riak-admin status` output into a Hash of string keys to string
  # values. Lines that are not of the form "key : value" (for instance any
  # header or separator lines) are skipped rather than handed to Hash[],
  # which rejects malformed pairs on current Ruby versions.
  # Raises RuntimeError when riak-admin exits unsuccessfully.
  def stats_riak_admin
    str = `riak-admin status`
    raise "riak-admin failed" unless $?.success?

    pairs = str.split(/\n/).map { |line| line.split(/ : /, 2) }
    Hash[pairs.select { |pair| pair.size == 2 }]
  end

  # Get current stats as a hash.
  #
  # Uses HTTP when the probe in #initialize succeeded, otherwise riak-admin
  # when it is installed. With neither available, a critical event is
  # reported and a RuntimeError is raised.
  def stats
    return stats_http if @httpstatus
    return stats_riak_admin if @riakadmin

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
    )
    raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
  end

  # Stat keys that are reported as per-second rates.
  def core_services
    ['vnode_gets', 'vnode_puts', 'node_gets', 'node_puts', 'read_repairs']
  end

  # FSM kinds for which latency percentiles are reported.
  def fsm_types
    ['get', 'put']
  end

  # Latency percentiles reported for every FSM kind.
  def fsm_percentiles
    [50, 95, 99]
  end

  # Reports current stats to Riemann.
  #
  # When the stats cannot be fetched, every service this method would
  # normally report ('riak', each core service, each FSM percentile) is
  # reported as 'critical' with the error message, so none of them silently
  # goes stale.
  def check_stats
    begin
      stats = self.stats
    rescue => e
      event = {:state => 'critical',
               :description => e.message,
               :host => opts[:riak_host]}
      # Report errors
      report(event.merge(:service => 'riak'))
      core_services.each do |s|
        report(event.merge(:service => "riak #{s}"))
      end
      fsm_types.each do |type|
        fsm_percentiles.each do |percentile|
          report(event.merge(:service => "riak #{type} #{percentile}"))
        end
      end
      return
    end

    # Riak itself
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'ok'
    )

    # Gets/puts/rr
    # NOTE(review): dividing by 60 treats each stat as a one-minute total;
    # confirm against the Riak stats documentation.
    core_services.each do |s|
      rate = stats[s].to_i / 60.0
      report(
        :host => opts[:riak_host],
        :service => "riak #{s}",
        :state => 'ok',
        :metric => rate,
        :description => "#{rate}/sec"
      )
    end

    # FSMs
    fsm_types.each do |type|
      fsm_percentiles.each do |percentile|
        # to_i maps a missing stat (nil) and non-numeric values such as
        # 'undefined' to 0.
        val = stats[fsm_stat(type, percentile)].to_i / 1000.0 # Convert us to ms
        report(
          :host => opts[:riak_host],
          :service => "riak #{type} #{percentile}",
          :state => fsm_state(type, percentile, val),
          :metric => val,
          :description => "#{val} ms"
        )
      end
    end
  end

  # Runs one round of checks.
  # NOTE(review): presumably invoked periodically by Riemann::Tools -- confirm.
  def tick
    # This can utterly destroy a cluster, so we disable
    # check_keys
    check_stats
    check_ring
    check_disk
  end

  private

  # Issues a GET for opts[:stats_path] against the configured Riak host and
  # stats port and returns the HTTP response object.
  #
  # opts[:riak_host] may be a bare hostname or an http(s):// URL; when it
  # parses without a host component the raw option value is used as the host.
  # An https scheme enables TLS with certificate verification turned off.
  # Any parsing or network error propagates to the caller.
  def riak_stats_response
    uri = URI.parse(opts[:riak_host])
    uri.host = opts[:riak_host] if uri.host.nil?

    http = Net::HTTP.new(uri.host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

    http.start { |conn| conn.get opts[:stats_path] }
  end
end

Riemann::Tools::Riak.run