module CloudCrowd

  # The Worker, forked off from the Node when a new WorkUnit is received,
  # launches an Action for processing. Workers will only ever receive WorkUnits
  # that they are able to handle (for which they have a corresponding action in
  # their actions directory). If communication with the central server is
  # interrupted, the Worker will repeatedly attempt to complete its unit --
  # every Worker::RETRY_WAIT seconds. Any exceptions that take place during
  # the course of the Action will cause the Worker to mark the WorkUnit as
  # having failed. When finished, the Worker's process exits, minimizing the
  # potential for memory leaks.
  class Worker

    # Wait five seconds to retry, after internal communication errors.
    RETRY_WAIT = 5

    attr_reader :pid, :node, :unit, :status

    # A new Worker customizes itself to its WorkUnit at instantiation.
    def initialize(node, unit)
      @start_time = Time.now
      @pid        = $$
      @node       = node
      @unit       = unit
      @status     = @unit['status']
      @retry_wait = RETRY_WAIT
    end

    # Return output to the central server, marking the WorkUnit done.
    def complete_work_unit(result)
      keep_trying_to "complete work unit" do
        data = base_params.merge({:status => 'succeeded', :output => result})
        @node.server["/work/#{data[:id]}"].put(data)
        log "finished #{display_work_unit} in #{data[:time]} seconds"
      end
    end

    # Mark the WorkUnit failed, returning the exception to central.
    def fail_work_unit(exception)
      keep_trying_to "mark work unit as failed" do
        data = base_params.merge({:status => 'failed', :output => {'output' => exception.message}.to_json})
        @node.server["/work/#{data[:id]}"].put(data)
        log "failed #{display_work_unit} in #{data[:time]} seconds\n#{exception.message}\n#{exception.backtrace}"
      end
    end

    # We expect and require internal communication between the central server
    # and the workers to succeed. If it fails for any reason, log it, and then
    # keep trying the same request.
    def keep_trying_to(title)
      begin
        yield
      rescue RestClient::ResourceNotFound => e
        log "work unit ##{@unit['id']} doesn't exist. discarding..."
      rescue Exception => e
        log "failed to #{title} -- retry in #{@retry_wait} seconds"
        log e.message
        log e.backtrace
        sleep @retry_wait
        retry
      end
    end

    # Loggable details describing what the Worker is up to.
    def display_work_unit
      "unit ##{@unit['id']} (#{@unit['action']}/#{CloudCrowd.display_status(@status)})"
    end

    # Executes the WorkUnit by running the Action, catching all exceptions as
    # failures. We capture the thread so that we can kill it from the outside,
    # when exiting.
    def run_work_unit
      @worker_thread = Thread.new do
        begin
          result = nil
          action_class = CloudCrowd.actions[@unit['action']]
          action = action_class.new(@status, @unit['input'], enhanced_unit_options, @node.asset_store)
          Dir.chdir(action.work_directory) do
            result = case @status
            when PROCESSING then action.process
            when SPLITTING  then action.split
            when MERGING    then action.merge
            else raise Error::StatusUnspecified, "work units must specify their status"
            end
          end
          complete_work_unit({'output' => result}.to_json)
        rescue Exception => e
          fail_work_unit(e)
        ensure
          action.cleanup_work_directory if action
        end
      end
      @worker_thread.join
    end

    # Wraps run_work_unit to benchmark the execution time, if requested.
    def run
      trap_signals
      log "starting #{display_work_unit}"
      return run_work_unit unless @unit['options']['benchmark']
      log("ran #{display_work_unit} in " + Benchmark.measure { run_work_unit }.to_s)
    end
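
    # For reference, a unit that requests benchmarking might look roughly like
    # the hypothetical hash below -- `run` only checks the 'benchmark' key of
    # the options; the other keys shown are the ones this class reads elsewhere:
    #
    #   {'id' => 42, 'job_id' => 7, 'action' => 'word_count', 'attempts' => 0,
    #    'status' => CloudCrowd::PROCESSING, 'input' => 'http://example.com/file.txt',
    #    'options' => {'benchmark' => true}}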

    # There are some potentially important attributes of the WorkUnit that we'd
    # like to pass into the Action -- in case it needs to know them. They will
    # always be made available in the options hash.
    def enhanced_unit_options
      @unit['options'].merge({
        'job_id'       => @unit['job_id'],
        'work_unit_id' => @unit['id'],
        'attempts'     => @unit['attempts']
      })
    end

    # How long has this worker been running for?
    def time_taken
      Time.now - @start_time
    end


    private

    # Common parameters to send back to central upon unit completion,
    # regardless of success or failure.
    def base_params
      { :pid  => @pid,
        :id   => @unit['id'],
        :time => time_taken }
    end

    # Log a message to the daemon log. Includes PID for identification.
    def log(message)
      puts "Worker ##{@pid}: #{message}" unless ENV['RACK_ENV'] == 'test'
    end

    # When signaled to exit, make sure that the Worker shuts down cleanly.
    def trap_signals
      Signal.trap('INT')  { shut_down }
      Signal.trap('KILL') { shut_down }
      Signal.trap('TERM') { shut_down }
    end

    # Force the Worker to quit, even if it's in the middle of processing.
    # If it had a checked-out WorkUnit, the Node should have released it on
    # the central server already.
    def shut_down
      if @worker_thread
        @worker_thread.kill
        @worker_thread.kill! if @worker_thread.alive?
      end
      Process.exit
    end

  end

end
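
# A rough usage sketch, assuming the surrounding pieces exist: the Node forks
# before handing the parsed unit hash to a Worker, so the child process exits
# once the unit is finished, as described in the class comment above. The
# `node` and `unit` values here are hypothetical stand-ins for what the Node
# actually builds from the central server's response:
#
#   pid = fork { CloudCrowd::Worker.new(node, unit).run }
#   Process.detach(pid)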