lib/hako/schedulers/ecs.rb in hako-1.3.0 vs lib/hako/schedulers/ecs.rb in hako-1.3.1

- old
+ new

@@ -813,52 +813,65 @@ end raise "Unable to find rollback target. #{task_definition.task_definition_arn} is INACTIVE?" end + MIN_ASG_INTERVAL = 1 + MAX_ASG_INTERVAL = 120 # @param [Aws::ECS::Types::TaskDefinition] task_definition # @return [Boolean] true if the capacity is reserved def on_no_tasks_started(task_definition) unless @autoscaling_group_for_oneshot return false end autoscaling = Aws::AutoScaling::Client.new + interval = MIN_ASG_INTERVAL loop do - asg = autoscaling.describe_auto_scaling_groups(auto_scaling_group_names: [@autoscaling_group_for_oneshot]).auto_scaling_groups[0] + begin + asg = autoscaling.describe_auto_scaling_groups(auto_scaling_group_names: [@autoscaling_group_for_oneshot]).auto_scaling_groups[0] + rescue Aws::AutoScaling::Errors::Throttling => e + Hako.logger.error(e) + interval = [interval * 2, MAX_ASG_INTERVAL].min + Hako.logger.info("Retrying after #{interval} seconds...") + sleep interval + next + end unless asg raise Error.new("AutoScaling Group '#{@autoscaling_group_for_oneshot}' does not exist") end container_instances = ecs_client.list_container_instances(cluster: @cluster).flat_map { |c| ecs_client.describe_container_instances(cluster: @cluster, container_instances: c.container_instance_arns).container_instances } if has_capacity?(task_definition, container_instances) Hako.logger.info("There's remaining capacity. Start retrying...") return true end + interval = [interval / 2, MIN_ASG_INTERVAL].max # Check autoscaling group health current = asg.instances.count { |i| i.lifecycle_state == 'InService' } if asg.desired_capacity != current - Hako.logger.debug("#{asg.auto_scaling_group_name} isn't in desired state. desired_capacity=#{asg.desired_capacity} in-service instances=#{current}") - sleep 1 + Hako.logger.debug("#{asg.auto_scaling_group_name} isn't in desired state. desired_capacity=#{asg.desired_capacity} in-service instances=#{current}. Retry after #{interval} seconds") + sleep interval next end # Check out-of-service instances out_instances = asg.instances.map(&:instance_id) container_instances.each do |ci| out_instances.delete(ci.ec2_instance_id) end unless out_instances.empty? - Hako.logger.debug("There's instances that is running but not registered as container instances: #{out_instances}") - sleep 1 + Hako.logger.debug("There's instances that is running but not registered as container instances: #{out_instances}. Retry after #{interval} seconds") + sleep interval next end # Scale out desired = current + 1 Hako.logger.info("Increment desired_capacity of #{asg.auto_scaling_group_name} from #{current} to #{desired}") autoscaling.set_desired_capacity(auto_scaling_group_name: asg.auto_scaling_group_name, desired_capacity: desired) + sleep interval end end # @param [Aws::ECS::Types::TaskDefinition] task_definition # @param [Array<Aws::ECS::Types::ContainerInstance>] container_instances