bin/riemann-riak in riemann-tools-0.1.8 vs bin/riemann-riak in riemann-tools-0.1.9
- old
+ new
@@ -24,22 +24,14 @@
opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
def initialize
- @escript = true
- @riakadmin = true
- @httpstatus = true
-
- if `which escript` =~ /^\s*$/
- @escript = false
- end
-
- if `which riak-admin` =~ /^\s*$/
- @riakadmin = false
- end
-
+ detect_features
+
+ @httpstatus = true
+ # What's going on here? --aphyr
if
begin
uri = URI.parse(opts[:riak_host])
if uri.host == nil
uri.host = opts[:riak_host]
@@ -54,15 +46,29 @@
end
rescue => e
@httpstatus = false
end
end
+
# we're going to override the emulator setting to allow users to
# dynamically input the cookie
# this is done only once - hopefully it doesn't get overridden.
ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
+ end
+ # Checks which external tools this checker can shell out to.
+ # Sets @escript and @riakadmin to false when `which` prints nothing (or only
+ # whitespace) for the corresponding binary, and leaves them true otherwise.
+ def detect_features
+ # Assume both tools are present until `which` says otherwise
+ @escript = true
+ @riakadmin = true
+
+ @escript = false if `which escript` =~ /^\s*$/
+ @riakadmin = false if `which riak-admin` =~ /^\s*$/
end
def check_ring
if @escript
str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
@@ -134,83 +140,129 @@
else
'critical'
end
end
- def check_stats
- if @httpstatus
- begin
- uri = URI.parse(opts[:riak_host])
- if uri.host == nil
- uri.host = opts[:riak_host]
- end
- http = Net::HTTP.new(uri.host, opts[:stats_port])
- http.use_ssl = uri.scheme == 'https'
- if http.use_ssl?
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
- res = http.start do |http|
- http.get opts[:stats_path]
+ # Get current stats via HTTP.
+ #
+ # Requests opts[:stats_path] from opts[:riak_host] on opts[:stats_port]
+ # (over SSL when the host is given as an https URL) and returns the
+ # JSON-decoded body of a 200 response.
+ #
+ # On a transport error, or on any non-200 response, a critical 'riak' event
+ # is reported and an exception is raised: the original error is re-raised in
+ # the first case, a RuntimeError describing the response in the second.
+ def stats_http
+ begin
+ uri = URI.parse(opts[:riak_host])
+ if uri.host == nil
+ uri.host = opts[:riak_host]
end
- rescue => e
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
- )
- return
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
+ http.use_ssl = uri.scheme == 'https'
+ if http.use_ssl?
+ # NOTE: certificate verification is disabled for https endpoints
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
-
- if res.code.to_i == 200
- stats = JSON.parse(res.body)
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
- )
- return
+ res = http.start do |http|
+ http.get opts[:stats_path]
end
+ rescue => e
+ report(
+ :host => opts[:riak_host],
+ :service => 'riak',
+ :state => 'critical',
+ :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
+ )
+ raise
+ end
+
+ if res.code.to_i == 200
+ return JSON.parse(res.body)
+ else
+ report(
+ :host => opts[:riak_host],
+ :service => 'riak',
+ :state => 'critical',
+ :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
+ )
+ # The response object exposes the status as #code; the message must carry it
+ raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
+ end
+ end
+
+ # Get current stats via riak-admin.
+ #
+ # Runs `riak-admin status` and returns a Hash mapping each stat name to its
+ # value, both as strings. Lines without the " : " separator (blank lines,
+ # banners) are skipped, and only the first separator on a line splits it, so
+ # a value that itself contains " : " is kept whole.
+ #
+ # Raises RuntimeError ("riak-admin failed") when the command does not exit
+ # successfully.
+ def stats_riak_admin
+ str = `riak-admin status`
+ raise "riak-admin failed" unless $?.success?
+ pairs = str.split(/\n/).map { |line| line.split(/ : /, 2) }
+ # Hash[] rejects entries that are not [key, value]; keep only real pairs
+ Hash[pairs.select { |pair| pair.size == 2 }]
+ end
+
+ # Get current stats as a hash.
+ #
+ # Uses stats_http when @httpstatus is set, otherwise stats_riak_admin when
+ # @riakadmin is set. When neither flag is set, reports a critical 'riak'
+ # event and raises a RuntimeError carrying the same message.
+ def stats
+ if @httpstatus
+ stats_http
elsif @riakadmin
- stats = Hash[`riak-admin status`.split(/\n/).map{|i| i.split(/ : /)}]
+ stats_riak_admin
else
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "error fetching Riak stats"
- )
- return
+ report(
+ :host => opts[:riak_host],
+ :service => 'riak',
+ :state => 'critical',
+ :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
+ )
+ raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
end
+ end
+ # Names of the Riak stats that are each reported as a "riak <name>" service
+ def core_services
+ %w[vnode_gets vnode_puts node_gets node_puts read_repairs]
+ end
+
+ # FSM types iterated over when reporting per-percentile FSM stats
+ def fsm_types
+ %w[get put]
+ end
+
+ # Percentiles reported for every FSM type (one "riak <type> <percentile>"
+ # service per combination)
+ def fsm_percentiles
+ [50, 95, 99]
+ end
+
+ # Reports current stats to Riemann
+ def check_stats
+ begin
+ stats = self.stats
+ rescue => e
+ event = {:state => 'critical',
+ :description => e.message,
+ :host => opts[:riak_host]}
+ # Report errors
+ report(event.merge(:service => 'riak'))
+ core_services.each do |s|
+ report(event.merge(:service => "riak #{s}"))
+ end
+ fsm_types.each do |type|
+ fsm_percentiles.each do |percentile|
+ report(event.merge(:service => "riak #{type} #{percentile}"))
+ end
+ end
+ return
+ end
+
+ # Riak itself
report(
:host => opts[:riak_host],
:service => 'riak',
:state => 'ok'
)
# Gets/puts/rr
- [
- 'vnode_gets',
- 'vnode_puts',
- 'node_gets',
- 'node_puts',
- 'read_repairs'
- ].each do |s|
+ core_services.each do |s|
report(
:host => opts[:riak_host],
:service => "riak #{s}",
:state => 'ok',
:metric => stats[s].to_i/60.0,
:description => "#{stats[s].to_i/60.0}/sec"
)
end
# FSMs
- ['get', 'put'].each do |type|
- [50, 95, 99].each do |percentile|
+ fsm_types.each do |type|
+ fsm_percentiles.each do |percentile|
val = stats[fsm_stat(type, percentile)].to_i || 0
val = 0 if val == 'undefined'
val /= 1000.0 # Convert us to ms
state = fsm_state(type, percentile, val)
report(