lib/hako/schedulers/ecs.rb in hako-1.1.0 vs lib/hako/schedulers/ecs.rb in hako-1.2.0
- old
+ new
@@ -1,6 +1,7 @@
# frozen_string_literal: true
+
require 'aws-sdk'
require 'hako'
require 'hako/error'
require 'hako/scheduler'
require 'hako/schedulers/ecs_autoscaling'
@@ -645,10 +646,12 @@
started_at =
if @timeout
Process.clock_gettime(Process::CLOCK_MONOTONIC)
end
+ started_task_ids = []
+
loop do
if started_at
if Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at > @timeout
Hako.logger.error('Timed out')
return false
@@ -664,15 +667,26 @@
s.events.each do |e|
if e.id == latest_event_id
break
end
Hako.logger.info "#{e.created_at}: #{e.message}"
+ task_id = extract_task_id(e.message)
+ if task_id && e.message.include?(' has started ')
+ started_task_ids << task_id
+ end
end
latest_event_id = find_latest_event_id(s.events)
Hako.logger.debug " latest_event_id=#{latest_event_id}, deployments=#{s.deployments}"
no_active = s.deployments.all? { |d| d.status != 'ACTIVE' }
primary = s.deployments.find { |d| d.status == 'PRIMARY' }
+ if primary.desired_count < started_task_ids.size
+ Hako.logger.error('Some started tasks are stopped. It seems new deployment is failing to start')
+ ecs_client.describe_tasks(cluster: service.cluster_arn, tasks: started_task_ids).tasks.each do |task|
+ report_task_diagnostics(task)
+ end
+ return false
+ end
primary_ready = primary && primary.running_count == primary.desired_count
if no_active && primary_ready
return true
else
sleep 1
@@ -685,9 +699,26 @@
def find_latest_event_id(events)
if events.empty?
nil
else
events[0].id
+ end
+ end
+
+ TASK_ID_RE = /\(task ([\h-]+)\)\.\z/
+ # @param [String] message
+ # @return [String, nil]
+ def extract_task_id(message)
+ message.slice(TASK_ID_RE, 1)
+ end
+
+ # @param [Aws::ECS::Types::Task] task
+ # @return [nil]
+ def report_task_diagnostics(task)
+ Hako.logger.error("task_definition_arn=#{task.task_definition_arn} last_status=#{task.last_status}")
+ Hako.logger.error(" stopped_reason: #{task.stopped_reason}")
+ task.containers.sort_by(&:name).each do |container|
+ Hako.logger.error(" Container #{container.name}: last_status=#{container.last_status} exit_code=#{container.exit_code.inspect} reason=#{container.reason.inspect}")
end
end
# @param [Aws::ECS::Types::TaskDefinition]
# @return [String]