# -*- ruby -*- # vim: set noet nosta sw=4 ts=4 : # # Get the current WAL segment and offset from a master postgresql # server, and compare slave servers to see how far behind they # are in MB. This script should be easily modified for use with # Nagios/Mon/Monit/Zabbix/whatever, or wrapping it in a display loop, # and is suitable for both WAL shipping or streaming forms of replication. # # Mahlon E. Smith # # First argument is the master server, all other arguments are treated # as slave machines. # # db_replication.monitor db-master.example.com ... # require 'ostruct' require 'optparse' require 'pathname' require 'etc' require 'pg' require 'pp' ### A class to encapsulate the PG handles. ### class PGMonitor VERSION = %q$Id$ # When to consider a slave as 'behind', measured in WAL segments. # The default WAL segment size is 16, so we'll alert after # missing two WAL files worth of data. # LAG_ALERT = 32 ### Create a new PGMonitor object. ### def initialize( opts, hosts ) @opts = opts @master = hosts.shift @slaves = hosts @current_wal = {} @failures = [] end attr_reader :opts, :current_wal, :master, :slaves, :failures ### Perform the connections and check the lag. ### def check # clear prior failures, get current xlog info @failures = [] return unless self.get_current_wal # check all slaves self.slaves.each do |slave| begin slave_db = PG.connect( :dbname => self.opts.database, :host => slave, :port => self.opts.port, :user => self.opts.user, :password => self.opts.pass, :sslmode => 'prefer' ) xlog = slave_db.exec( 'SELECT pg_last_xlog_receive_location()' ).getvalue( 0, 0 ) slave_db.close lag_in_megs = ( self.find_lag( xlog ).to_f / 1024 / 1024 ).abs if lag_in_megs >= LAG_ALERT failures << { :host => slave, :error => "%0.2fMB behind the master." % [ lag_in_megs ] } end rescue => err failures << { :host => slave, :error => err.message } end end end ######### protected ######### ### Ask the master for the current xlog information, to compare ### to slaves. Returns true on success. On failure, populates ### the failures array and returns false. ### def get_current_wal master_db = PG.connect( :dbname => self.opts.database, :host => self.master, :port => self.opts.port, :user => self.opts.user, :password => self.opts.pass, :sslmode => 'prefer' ) self.current_wal[ :segbytes ] = master_db.exec( 'SHOW wal_segment_size' ). getvalue( 0, 0 ).sub( /\D+/, '' ).to_i << 20 current = master_db.exec( 'SELECT pg_current_xlog_location()' ).getvalue( 0, 0 ) self.current_wal[ :segment ], self.current_wal[ :offset ] = current.split( /\// ) master_db.close return true # If we can't get any of the info from the master, then there is no # point in a comparison with slaves. # rescue => err self.failures << { :host => self.master, :error => 'Unable to retrieve required info from the master (%s)' % [ err.message ] } return false end ### Given an +xlog+ position from a slave server, return ### the number of bytes the slave needs to replay before it ### is caught up to the master. ### def find_lag( xlog ) s_segment, s_offset = xlog.split( /\// ) m_segment = self.current_wal[ :segment ] m_offset = self.current_wal[ :offset ] m_segbytes = self.current_wal[ :segbytes ] return (( m_segment.hex - s_segment.hex ) * m_segbytes) + ( m_offset.hex - s_offset.hex ) end end ### Parse command line arguments. Return a struct of global options. ### def parse_args( args ) options = OpenStruct.new options.database = 'postgres' options.port = 5432 options.user = Etc.getpwuid( Process.uid ).name options.sslmode = 'prefer' opts = OptionParser.new do |opts| opts.banner = "Usage: #{$0} [options] [slave2, slave3...]" opts.separator '' opts.separator 'Connection options:' opts.on( '-d', '--database DBNAME', "specify the database to connect to (default: \"#{options.database}\")" ) do |db| options.database = db end opts.on( '-h', '--host HOSTNAME', 'database server host' ) do |host| options.host = host end opts.on( '-p', '--port PORT', Integer, "database server port (default: \"#{options.port}\")" ) do |port| options.port = port end opts.on( '-U', '--user NAME', "database user name (default: \"#{options.user}\")" ) do |user| options.user = user end opts.on( '-W', 'force password prompt' ) do |pw| print 'Password: ' begin system 'stty -echo' options.pass = $stdin.gets.chomp ensure system 'stty echo' puts end end opts.separator '' opts.separator 'Other options:' opts.on_tail( '--help', 'show this help, then exit' ) do $stderr.puts opts exit end opts.on_tail( '--version', 'output version information, then exit' ) do puts PGMonitor::VERSION exit end end opts.parse!( args ) return options end if __FILE__ == $0 opts = parse_args( ARGV ) raise ArgumentError, "At least two PostgreSQL servers are required." if ARGV.length < 2 mon = PGMonitor.new( opts, ARGV ) mon.check if mon.failures.empty? puts "All is well!" exit 0 else puts "Database replication delayed or broken." mon.failures.each do |bad| puts "%s: %s" % [ bad[ :host ], bad[ :error ] ] end exit 1 end end