tools/riemann-riak/bin/riemann-riak in riemann-tools-1.0.0 vs tools/riemann-riak/bin/riemann-riak in riemann-tools-1.1.0
- old
+ new
@@ -1,331 +1,323 @@
#!/usr/bin/env ruby
-Process.setproctitle($0)
+# frozen_string_literal: true
+require 'English'
+Process.setproctitle($PROGRAM_NAME)
+
# Forwards information on a Riak node to Riemann.
require 'riemann/tools'
-class Riemann::Tools::Riak
- include Riemann::Tools
- require 'net/http'
- require 'net/https'
- require 'yajl/json_gem'
+module Riemann
+ module Tools
+ class Riak
+ include Riemann::Tools
+ require 'net/http'
+ require 'net/https'
+ require 'yajl/json_gem'
- opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
- opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
- opt :stats_port, "Riak HTTP port for stats", :default => 8098
- opt :stats_path, "Riak HTTP stats path", :default => '/stats'
- opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
- opt :cookie, "Riak cookie to use", :default => "riak"
+ opt :riak_host, 'Riak host for stats <IP> or SSL http(s)://<IP>', default: Socket.gethostname
+ opt :data_dir, 'Riak data directory', default: '/var/lib/riak'
+ opt :stats_port, 'Riak HTTP port for stats', default: 8098
+ opt :stats_path, 'Riak HTTP stats path', default: '/stats'
+ opt :node_name, 'Riak erlang node name', default: "riak@#{Socket.gethostname}"
+ opt :cookie, 'Riak cookie to use', default: 'riak'
- opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
- opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
- opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
- opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
- opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
- opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
+ opt :get_50_warning, 'FSM 50% get time warning threshold (ms)', default: 1000
+ opt :put_50_warning, 'FSM 50% put time warning threshold (ms)', default: 1000
+ opt :get_95_warning, 'FSM 95% get time warning threshold (ms)', default: 2000
+ opt :put_95_warning, 'FSM 95% put time warning threshold (ms)', default: 2000
+ opt :get_99_warning, 'FSM 99% get time warning threshold (ms)', default: 10_000
+ opt :put_99_warning, 'FSM 99% put time warning threshold (ms)', default: 10_000
- def initialize
- detect_features
+ def initialize
+ detect_features
- @httpstatus = true
+ @httpstatus = true
- begin
- uri = URI.parse(opts[:riak_host])
- if uri.host == nil
- uri.host = opts[:riak_host]
+ begin
+ uri = URI.parse(opts[:riak_host])
+ uri.host = opts[:riak_host] if uri.host.nil?
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
+ http.use_ssl = uri.scheme == 'https'
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
+ http.start do |h|
+ h.get opts[:stats_path]
+ end
+ rescue StandardError => _e
+ @httpstatus = false
+ end
+
+ # we're going to override the emulator setting to allow users to
+ # dynamically input the cookie
+ # this is done only once - hopefully it doesn't get overridden.
+ ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
end
- http = Net::HTTP.new(uri.host, opts[:stats_port])
- http.use_ssl = uri.scheme == 'https'
- if http.use_ssl?
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
- http.start do |h|
- h.get opts[:stats_path]
- end
- rescue => _e
- @httpstatus = false
- end
- # we're going to override the emulator setting to allow users to
- # dynamically input the cookie
- # this is done only once - hopefully it doesn't get overridden.
- ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
- end
+ # Identifies whether escript and riak-admin are installed
+ def detect_features
+ @escript = true # Whether escript is present on this machine
+ @riakadmin = true # Whether riak-admin is present
- # Identifies whether escript and riak-admin are installed
- def detect_features
- @escript = true # Whether escript is present on this machine
- @riakadmin = true # Whether riak-admin is present
+ @escript = false if `which escript` =~ /^\s*$/
- if `which escript` =~ /^\s*$/
- @escript = false
- end
+ @riakadmin = false if `which riak-admin` =~ /^\s*$/
+ end
- if `which riak-admin` =~ /^\s*$/
- @riakadmin = false
- end
- end
+ def check_ring
+ str = if @escript
+ `#{__dir__}/riemann-riak-ring #{opts[:node_name]}`.chomp
+ elsif @riakadmin
+ `riak-admin ringready`
+ end
- def check_ring
- str = if @escript
- str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
- elsif @riakadmin
- str = `riak-admin ringready`
- else
- nil
- end
+ return if str.nil?
- return if str.nil?
+ if str =~ /^TRUE/
+ report(
+ host: opts[:riak_host],
+ service: 'riak ring',
+ state: 'ok',
+ description: str,
+ )
+ else
+ report(
+ host: opts[:riak_host],
+ service: 'riak ring',
+ state: 'warning',
+ description: str,
+ )
+ end
+ end
- if str =~ /^TRUE/
- report(
- :host => opts[:riak_host],
- :service => 'riak ring',
- :state => 'ok',
- :description => str
- )
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak ring',
- :state => 'warning',
- :description => str
- )
- end
- end
+ def check_keys
+ keys = `#{__dir__}/riemann-riak-keys #{opts[:node_name]}`.chomp
+ if keys =~ /^\d+$/
+ report(
+ host: opts[:riak_host],
+ service: 'riak keys',
+ state: 'ok',
+ metric: keys.to_i,
+ description: keys,
+ )
+ else
+ report(
+ host: opts[:riak_host],
+ service: 'riak keys',
+ state: 'unknown',
+ description: keys,
+ )
+ end
+ end
- def check_keys
- keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp
- if keys =~ /^\d+$/
- report(
- :host => opts[:riak_host],
- :service => 'riak keys',
- :state => 'ok',
- :metric => keys.to_i,
- :description => keys
- )
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak keys',
- :state => 'unknown',
- :description => keys
- )
- end
- end
+ def check_transfers
+ str = (`riak-admin transfers` if @riakadmin)
- def check_transfers
- str = if @riakadmin
- `riak-admin transfers`
- else
- nil
- end
+ return if str.nil?
- return if str.nil?
+ if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/
+ report(
+ host: opts[:riak_host],
+ service: 'riak transfers',
+ state: 'critical',
+ metric: Regexp.last_match(1).to_i,
+ description: "waiting to handoff #{Regexp.last_match(1)} partitions",
+ )
+ else
+ report(
+ host: opts[:riak_host],
+ service: 'riak transfers',
+ state: 'ok',
+ metric: 0,
+ description: 'No pending transfers',
+ )
+ end
+ end
- if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/
- report(
- :host => opts[:riak_host],
- :service => 'riak transfers',
- :state => 'critical',
- :metric => $1.to_i,
- :description => "waiting to handoff #{$1} partitions"
- )
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak transfers',
- :state => 'ok',
- :metric => 0,
- :description => "No pending transfers"
- )
- end
- end
+ def check_disk
+ gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
+ report(
+ host: opts[:riak_host],
+ service: 'riak disk',
+ state: 'ok',
+ metric: gb,
+ description: "#{gb} GB in #{opts[:data_dir]}",
+ )
+ end
- def check_disk
- gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
- report(
- :host => opts[:riak_host],
- :service => 'riak disk',
- :state => 'ok',
- :metric => gb,
- :description => "#{gb} GB in #{opts[:data_dir]}"
- )
- end
+ # Returns the riak stat for the given fsm type and percentile.
+ def fsm_stat(type, property, percentile)
+ "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
+ end
- # Returns the riak stat for the given fsm type and percentile.
- def fsm_stat(type, property, percentile)
- "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
- end
+ # Returns the alerts state for the given fsm.
+ def fsm_state(type, percentile, val)
+ limit = opts["#{type}_#{percentile}_warning".to_sym]
+ case val
+ when 0..limit
+ 'ok'
+ when limit..limit * 2
+ 'warning'
+ else
+ 'critical'
+ end
+ end
- # Returns the alerts state for the given fsm.
- def fsm_state(type, percentile, val)
- limit = opts["#{type}_#{percentile}_warning".to_sym]
- case val
- when 0 .. limit
- 'ok'
- when limit .. limit * 2
- 'warning'
- else
- 'critical'
- end
- end
+ # Get current stats via HTTP
+ def stats_http
+ begin
+ uri = URI.parse(opts[:riak_host])
+ uri.host = opts[:riak_host] if uri.host.nil?
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
+ http.use_ssl = uri.scheme == 'https'
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
+ res = http.start do |h|
+ h.get opts[:stats_path]
+ end
+ rescue StandardError => e
+ report(
+ host: opts[:riak_host],
+ service: 'riak',
+ state: 'critical',
+ description: "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}",
+ )
+ raise
+ end
- # Get current stats via HTTP
- def stats_http
- begin
- uri = URI.parse(opts[:riak_host])
- if uri.host == nil
- uri.host = opts[:riak_host]
+ if res.code.to_i == 200
+ JSON.parse(res.body)
+ else
+ report(
+ host: opts[:riak_host],
+ service: 'riak',
+ state: 'critical',
+ description: "stats returned HTTP #{res.code}:\n\n#{res.body}",
+ )
+ raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
+ end
end
- http = Net::HTTP.new(uri.host, opts[:stats_port])
- http.use_ssl = uri.scheme == 'https'
- if http.use_ssl?
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
- res = http.start do |h|
- h.get opts[:stats_path]
- end
- rescue => e
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
- )
- raise
- end
- if res.code.to_i == 200
- return JSON.parse(res.body)
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
- )
- raise "Can't fetch stats via HTTP: #{res.core}:\n\n#{res.body}"
- end
- end
+ # Get current stats via riak-admin
+ def stats_riak_admin
+ str = `riak-admin status`
+ raise 'riak-admin failed' unless $CHILD_STATUS == 0
- # Get current stats via riak-admin
- def stats_riak_admin
- str = `riak-admin status`
- raise "riak-admin failed" unless $? == 0
- Hash[str.split(/\n/).map{|i| i.split(/ : /)}]
- end
+ Hash[str.split(/\n/).map { |i| i.split(/ : /) }]
+ end
- # Get current stats as a hash
- def stats
- if @httpstatus
- stats_http
- elsif @riakadmin
- stats_riak_admin
- else
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'critical',
- :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
- )
- raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
- end
- end
+ # Get current stats as a hash
+ def stats
+ if @httpstatus
+ stats_http
+ elsif @riakadmin
+ stats_riak_admin
+ else
+ report(
+ host: opts[:riak_host],
+ service: 'riak',
+ state: 'critical',
+ description: 'No mechanism for fetching Riak stats: neither HTTP nor riak-admin available.',
+ )
+ raise 'No mechanism for fetching Riak stats: neither HTTP nor riak-admin available.'
+ end
+ end
- def core_services
- ['vnode_gets',
- 'vnode_puts',
- 'node_gets',
- 'node_puts',
- 'node_gets_set',
- 'node_puts_set',
- 'read_repairs']
- end
+ def core_services
+ %w[vnode_gets
+ vnode_puts
+ node_gets
+ node_puts
+ node_gets_set
+ node_puts_set
+ read_repairs]
+ end
- def fsm_types
- [{'get' => 'time'}, {'put' => 'time'},
- {'get' => 'set_objsize'}]
- end
+ def fsm_types
+ [{ 'get' => 'time' }, { 'put' => 'time' },
+ { 'get' => 'set_objsize' },]
+ end
- def fsm_percentiles
- [50, 95, 99]
- end
-
- # Reports current stats to Riemann
- def check_stats
- begin
- stats = self.stats
- rescue => e
- event = {:state => 'critical',
- :description => e.message,
- :host => opts[:riak_host]}
- # Report errors
- report(event.merge(:service => 'riak'))
- core_services.each do |s|
- report(event.merge(:service => "riak #{s}"))
+ def fsm_percentiles
+ [50, 95, 99]
end
- fsm_types.each do |typespec|
- typespec.each do |type, prop|
- fsm_percentiles.each do |percentile|
- report(event.merge(:service => "riak #{type} #{prop} #{percentile}"))
+
+ # Reports current stats to Riemann
+ def check_stats
+ begin
+ stats = self.stats
+ rescue StandardError => e
+ event = {
+ state: 'critical',
+ description: e.message,
+ host: opts[:riak_host],
+ }
+ # Report errors
+ report(event.merge(service: 'riak'))
+ core_services.each do |s|
+ report(event.merge(service: "riak #{s}"))
end
+ fsm_types.each do |typespec|
+ typespec.each do |type, prop|
+ fsm_percentiles.each do |percentile|
+ report(event.merge(service: "riak #{type} #{prop} #{percentile}"))
+ end
+ end
+ end
+ return
end
- end
- return
- end
- # Riak itself
- report(
- :host => opts[:riak_host],
- :service => 'riak',
- :state => 'ok'
- )
+ # Riak itself
+ report(
+ host: opts[:riak_host],
+ service: 'riak',
+ state: 'ok',
+ )
- # Gets/puts/rr
- core_services.each do |s|
- report(
- :host => opts[:riak_host],
- :service => "riak #{s}",
- :state => 'ok',
- :metric => stats[s].to_i/60.0,
- :description => "#{stats[s].to_i/60.0}/sec"
- )
- end
-
- # FSMs
- fsm_types.each do |typespec|
- typespec.each do |type, prop|
- fsm_percentiles.each do |percentile|
- val = stats[fsm_stat(type, prop, percentile)].to_i || 0
- val = 0 if val == 'undefined'
- val /= 1000.0 if prop == 'time' # Convert us to ms
- if prop == 'time'
- state = fsm_state(type, percentile, val)
- else
- state = "ok"
- end
+ # Gets/puts/rr
+ core_services.each do |s|
report(
- :host => opts[:riak_host],
- :service => "riak #{type} #{prop} #{percentile}",
- :state => state,
- :metric => val,
- :description => "#{val} ms"
+ host: opts[:riak_host],
+ service: "riak #{s}",
+ state: 'ok',
+ metric: stats[s].to_i / 60.0,
+ description: "#{stats[s].to_i / 60.0}/sec",
)
end
+
+ # FSMs
+ fsm_types.each do |typespec|
+ typespec.each do |type, prop|
+ fsm_percentiles.each do |percentile|
+ val = stats[fsm_stat(type, prop, percentile)].to_i || 0
+ val = 0 if val == 'undefined'
+ val /= 1000.0 if prop == 'time' # Convert us to ms
+ state = if prop == 'time'
+ fsm_state(type, percentile, val)
+ else
+ 'ok'
+ end
+ report(
+ host: opts[:riak_host],
+ service: "riak #{type} #{prop} #{percentile}",
+ state: state,
+ metric: val,
+ description: "#{val} ms",
+ )
+ end
+ end
+ end
end
- end
- end
- def tick
- # This can utterly destroy a cluster, so we disable
- # check_keys
- check_stats
- check_ring
- check_disk
- check_transfers
+ def tick
+ # This can utterly destroy a cluster, so we disable
+ # check_keys
+ check_stats
+ check_ring
+ check_disk
+ check_transfers
+ end
+ end
end
end
Riemann::Tools::Riak.run