# -*- ruby -*-
#encoding: utf-8

require 'set'
require 'cztop'
require 'cztop/reactor'
require 'cztop/reactor/signal_handling'
require 'loggability'

require 'arborist' unless defined?( Arborist )
require 'arborist/client'


# An event-driven runner for Arborist::Monitors.
#
# Each loaded monitor gets a periodic reactor timer. When a timer fires, the
# runner queues a search request; the search callback runs the monitor in its own
# thread and queues an update request containing the monitor's results. Queued
# requests are sent when the client's tree API socket becomes writable.
class Arborist::MonitorRunner
  extend Loggability
  include CZTop::Reactor::SignalHandling


  # Signals the runner handles (limited to those the current platform supports)
  QUEUE_SIGS = ([
    :INT, :TERM, :HUP, :USR1,
    # :TODO: :QUIT, :WINCH, :USR2, :TTIN, :TTOU
  ] & Signal.list.keys.map( &:to_sym )).freeze

  # Number of seconds between thread cleanup
  THREAD_CLEANUP_INTERVAL = 5 # seconds


  log_to :arborist


  ### Create a new Arborist::MonitorRunner
  def initialize
    @monitors       = []
    @handler        = nil
    @reactor        = CZTop::Reactor.new
    @client         = Arborist::Client.new
    @runner_threads = {}
    @request_queue  = {}
  end


  ######
  public
  ######

  ##
  # The Array of loaded Arborist::Monitors the runner should run.
  attr_reader :monitors

  ##
  # The handler object for async IO. It is initialized to +nil+ and is not
  # otherwise used within this file.
  attr_accessor :handler

  ##
  # The reactor (a CZTop::Reactor) the runner uses to drive everything
  attr_accessor :reactor

  ##
  # The Hash of pending requests, keyed by the callback that should be called with the
  # results.
  attr_reader :request_queue

  ##
  # The Arborist::Client that will provide the message packing and unpacking
  attr_reader :client

  ##
  # A hash of monitor object -> thread used to contain and track running monitor threads.
  attr_reader :runner_threads


  ### Load monitors from the specified +enumerator+, appending them to any
  ### monitors which are already loaded.
  def load_monitors( enumerator )
    self.monitors.concat( enumerator.to_a )
  end


  ### Run the loaded monitors: set up a timer for each one and the thread-cleanup
  ### timer, then start the reactor with signal handling installed. Returns when
  ### the reactor stops polling.
  def run
    self.monitors.each do |mon|
      self.add_timer_for( mon )
    end
    self.add_thread_cleanup_timer

    self.with_signal_handler( self.reactor, *QUEUE_SIGS ) do
      self.reactor.register( self.client.tree_api, :write, &self.method(:handle_io_event) )
      self.reactor.start_polling( ignore_interrupts: true )
    end
  end


  ### Restart the runner. Not implemented yet; always raises NotImplementedError.
  def restart
    # :TODO: Kill any running monitor children, cancel monitor timers, and reload
    #        monitors from the monitor enumerator
    raise NotImplementedError
  end


  ### Stop the runner.
  def stop
    self.log.info "Stopping the runner."
    self.reactor.stop_polling
  end


  ### Reactor callback -- handle the client's socket becoming writable by sending
  ### the oldest queued request (if there is one) and passing the response to its
  ### callback. Raises a RuntimeError if the +event+ isn't a writable event.
  def handle_io_event( event )
    raise "Unexpected %p on the tree API socket" % [ event ] unless event.writable?

    if (( pair = self.request_queue.shift ))
      callback, request = *pair
      res = self.client.send_tree_api_request( request )
      callback.call( res )
    end

    # Nothing left to send, so stop asking to be told about writability.
    self.unregister if self.request_queue.empty?
  end


  ### Update nodes with the results of a monitor's run. Queues a search for the
  ### nodes the +monitor+ is interested in; when the search results arrive, the
  ### monitor is run against them in a new thread (tracked in #runner_threads)
  ### and its results are queued as an update request.
  def run_monitor( monitor )
    positive     = monitor.positive_criteria
    negative     = monitor.negative_criteria
    exclude_down = monitor.exclude_down?
    props        = monitor.node_properties

    self.search( positive, exclude_down, props, negative ) do |nodes|
      self.log.info "Running %p monitor for %d node(s)" % [
        monitor.description,
        nodes.length
      ]

      unless nodes.empty?
        # NOTE(review): #update is called from the monitor thread here, so the
        # request queue and reactor registration are touched off the thread that
        # started this one -- confirm that is safe.
        self.runner_threads[ monitor ] = Thread.new do
          Thread.current[:monitor_desc] = monitor.description
          results = self.run_monitor_safely( monitor, nodes )

          self.log.debug " updating with results: %p" % [ results ]
          self.update( results, monitor.key ) do
            self.log.debug "Updated %d via the '%s' monitor" %
              [ results.length, monitor.description ]
          end
        end
        self.log.debug "THREAD: Started %p for %p" % [ self.runner_threads[monitor], monitor ]
        self.log.debug "THREAD: Runner threads have: %p" % [ self.runner_threads.to_a ]
      end
    end
  end


  ### Exec +monitor+ against the provided +nodes+ hash, treating
  ### runtime exceptions as an error condition. Returns an update
  ### hash, keyed by node identifier. If the monitor raises a StandardError,
  ### every node in +nodes+ gets an +error+ entry describing the exception.
  ###
  def run_monitor_safely( monitor, nodes )
    return monitor.run( nodes )
  rescue => err
    errmsg = "Exception while running %p monitor: %s: %s" % [
      monitor.description,
      err.class.name,
      err.message
    ]
    self.log.error "%s\n%s" % [ errmsg, err.backtrace.join("\n ") ]

    return nodes.keys.each_with_object({}) do |id, node_results|
      node_results[id] = { error: errmsg }
    end
  end


  ### Create a search request using the runner's client, then queue the request up
  ### with the specified +block+ as the callback.
  def search( criteria, exclude_down, properties, negative={}, &block )
    search = self.client.make_search_request( criteria,
      exclude_down: exclude_down,
      properties: properties,
      exclude: negative
    )
    self.queue_request( search, &block )
  end


  ### Create an update request using the runner's client, then queue the request up
  ### with the specified +block+ as the callback. Does nothing if +nodemap+ is empty.
  def update( nodemap, monitor_key, &block )
    return if nodemap.empty?

    update = self.client.make_update_request( nodemap, monitor_key: monitor_key )
    self.queue_request( update, &block )
  end


  ### Add the specified +request+ to the queue of requests to be sent over the
  ### client's tree API socket, and make sure the runner is registered for
  ### writing. The +callback+ is called with the response once the request has
  ### been sent; it is optional.
  def queue_request( request, &callback )
    # The queue is keyed by callback, so a missing one would file the request
    # under +nil+ (clobbering any other callback-less request) and then blow up
    # when #handle_io_event tried to call it. Use a distinct no-op instead.
    callback ||= proc {}

    self.request_queue[ callback ] = request
    self.register
  end


  ### Returns +true+ if the runner's client socket is currently registered for writing.
  def registered?
    return self.reactor.event_enabled?( self.client.tree_api, :write )
  end


  ### Enable write events for the client's tree API socket if they aren't
  ### enabled already.
  def register
    # self.log.debug "Registering for writing."
    self.reactor.enable_events( self.client.tree_api, :write ) unless self.registered?
  end


  ### Disable write events for the client's tree API socket when there's nothing
  ### ready to write.
  def unregister
    # self.log.debug "Unregistering for writing."
    self.reactor.disable_events( self.client.tree_api, :write ) if self.registered?
  end


  ### Register a timer for the specified +monitor+. Monitors with a non-zero
  ### splay have the start of their interval timer delayed; all others start
  ### their interval timer immediately.
  def add_timer_for( monitor )
    if monitor.splay.nonzero?
      self.add_splay_timer_for( monitor )
    else
      self.add_interval_timer_for( monitor )
    end
  end


  ### Create a periodic reactor timer that will run the specified +monitor+ on its
  ### interval. A run is skipped if the monitor still has an entry in
  ### #runner_threads, i.e., if its previous thread has not been cleaned up yet.
  def add_interval_timer_for( monitor )
    interval = monitor.interval

    self.log.info "Creating timer for %p" % [ monitor ]

    return self.reactor.add_periodic_timer( interval ) do
      unless self.runner_threads.key?( monitor )
        self.run_monitor( monitor )
      end
    end
  end


  ### Create a one-shot reactor timer that will register the interval timer for the
  ### specified +monitor+ after a random number of seconds no greater than its splay.
  def add_splay_timer_for( monitor )
    delay = rand( monitor.splay )
    self.log.debug "Splaying registration of %p for %ds" % [ monitor, delay ]

    self.reactor.add_oneshot_timer( delay ) do
      self.add_interval_timer_for( monitor )
    end
  end


  ### Set up a timer to clean up monitor threads every THREAD_CLEANUP_INTERVAL
  ### seconds.
  def add_thread_cleanup_timer
    self.log.debug "Starting thread cleanup timer for %p." % [ self.runner_threads ]

    self.reactor.add_periodic_timer( THREAD_CLEANUP_INTERVAL ) do
      self.cleanup_monitor_threads
    end
  end


  ### :TODO: Handle the thread-interrupt stuff?

  ### Clean up any monitor runner threads that are dead. Each dead thread is
  ### removed from #runner_threads and joined; a StandardError raised by the
  ### thread is logged instead of being propagated.
  def cleanup_monitor_threads
    # Collect the dead pairs first so the hash isn't modified while it's being
    # iterated.
    dead = self.runner_threads.reject {|_monitor, thr| thr.alive? }

    dead.each do |monitor, thr|
      self.runner_threads.delete( monitor )
      begin
        thr.join
      rescue => err
        self.log.error "%p while running %s: %s" %
          [ err.class, thr[:monitor_desc], err.message ]
      end
    end
  end


  #
  # :section: Signal Handling
  # These methods set up some behavior for starting, restarting, and stopping
  # the runner when a signal is received.
  #

  ### Handle signals by dispatching +sig+ to the appropriate handler method.
  def handle_signal( sig )
    self.log.debug "Handling signal %s" % [ sig ]
    case sig
    when :INT, :TERM
      self.on_termination_signal( sig )

    when :HUP
      self.on_hangup_signal( sig )

    when :USR1
      self.on_user1_signal( sig )

    else
      self.log.warn "Unhandled signal %s" % [ sig ]
    end
  end


  ### Handle a TERM signal by stopping the runner. Also
  ### aliased to #on_interrupt_signal.
  def on_termination_signal( signo )
    self.log.warn "Terminated (%p)" % [ signo ]
    self.stop
  end
  alias_method :on_interrupt_signal, :on_termination_signal


  ### Handle a hangup by restarting the runner. If restarting isn't implemented,
  ### the hangup is logged and ignored.
  def on_hangup_signal( signo )
    self.log.warn "Hangup (%p)" % [ signo ]
    self.restart
  rescue NotImplementedError
    # NotImplementedError is a ScriptError rather than a StandardError, so left
    # alone it would propagate out of the signal handler and the reactor.
    self.log.error "Restart is not implemented; ignoring hangup (%p)" % [ signo ]
  end


  ### Handle a USR1 signal. Defers to an inherited or mixed-in handler if there is
  ### one; otherwise the signal is just logged, as the runner defines no behavior
  ### of its own for it.
  def on_user1_signal( signo )
    return super if defined?( super )
    self.log.warn "User signal 1 (%p) received; no action is defined for it." % [ signo ]
  end

end # class Arborist::MonitorRunner