Skip to content

Commit 9e77f6f

Browse files
Change reconcile/container update order on init and waitForHostResources/emitCurrentStatus order (aws#3747)
1 parent 2d30365 commit 9e77f6f

File tree

4 files changed

+28
-10
lines changed

4 files changed

+28
-10
lines changed

agent/api/task/task.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -3628,8 +3628,8 @@ func (task *Task) ToHostResources() map[string]*ecs.Resource {
36283628
"taskArn": task.Arn,
36293629
"CPU": *resources["CPU"].IntegerValue,
36303630
"MEMORY": *resources["MEMORY"].IntegerValue,
3631-
"PORTS_TCP": resources["PORTS_TCP"].StringSetValue,
3632-
"PORTS_UDP": resources["PORTS_UDP"].StringSetValue,
3631+
"PORTS_TCP": aws.StringValueSlice(resources["PORTS_TCP"].StringSetValue),
3632+
"PORTS_UDP": aws.StringValueSlice(resources["PORTS_UDP"].StringSetValue),
36333633
"GPU": *resources["GPU"].IntegerValue,
36343634
})
36353635
return resources

agent/engine/docker_task_engine.go

+8-3
Original file line numberDiff line numberDiff line change
@@ -597,15 +597,20 @@ func (engine *DockerTaskEngine) synchronizeState() {
597597
}
598598

599599
tasks := engine.state.AllTasks()
600+
// For normal task progress, overseeTask 'consume's resources through waitForHostResources in host_resource_manager before progressing
601+
// For agent restarts (state restore), we pre-consume resources for tasks that had progressed beyond waitForHostResources stage -
602+
// so these tasks do not wait during 'waitForHostResources' call again - do not go through queuing again
603+
//
604+
// Call reconcileHostResources before
605+
// - filterTasksToStartUnsafe which will reconcile container statuses for the duration the agent was stopped
606+
// - starting managedTask's overseeTask goroutines
607+
engine.reconcileHostResources()
600608
tasksToStart := engine.filterTasksToStartUnsafe(tasks)
601609
for _, task := range tasks {
602610
task.InitializeResources(engine.resourceFields)
603611
engine.saveTaskData(task)
604612
}
605613

606-
// Before starting managedTask goroutines, pre-allocate resources for tasks which
607-
// which have progressed beyond resource check (waitingTaskQueue) stage
608-
engine.reconcileHostResources()
609614
for _, task := range tasksToStart {
610615
engine.startTask(task)
611616
}

agent/engine/host_resource_manager.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/aws/amazon-ecs-agent/ecs-agent/ecs_client/model/ecs"
2424
"github.com/aws/amazon-ecs-agent/ecs-agent/logger"
2525
"github.com/aws/amazon-ecs-agent/ecs-agent/logger/field"
26+
"github.com/aws/aws-sdk-go/aws"
2627
)
2728

2829
const (
@@ -72,8 +73,8 @@ func (h *HostResourceManager) logResources(msg string, taskArn string) {
7273
"taskArn": taskArn,
7374
"CPU": *h.consumedResource[CPU].IntegerValue,
7475
"MEMORY": *h.consumedResource[MEMORY].IntegerValue,
75-
"PORTS_TCP": h.consumedResource[PORTSTCP].StringSetValue,
76-
"PORTS_UDP": h.consumedResource[PORTSUDP].StringSetValue,
76+
"PORTS_TCP": aws.StringValueSlice(h.consumedResource[PORTSTCP].StringSetValue),
77+
"PORTS_UDP": aws.StringValueSlice(h.consumedResource[PORTSUDP].StringSetValue),
7778
"GPU": *h.consumedResource[GPU].IntegerValue,
7879
})
7980
}

agent/engine/task_manager.go

+15-3
Original file line numberDiff line numberDiff line change
@@ -199,12 +199,17 @@ func (mtask *managedTask) overseeTask() {
199199
// `desiredstatus`es which are a construct of the engine used only here,
200200
// not present on the backend
201201
mtask.UpdateStatus()
202-
// If this was a 'state restore', send all unsent statuses
203-
mtask.emitCurrentStatus()
204202

205-
// Wait for host resources required by this task to become available
203+
// Wait here until enough resources are available on host for the task to progress
204+
// - Waits until host resource manager succesfully 'consume's task resources and returns
205+
// - For tasks which have crossed this stage before (on agent restarts), resources are pre-consumed - returns immediately
206+
// - If the task is already stopped (knownStatus is STOPPED), does not attempt to consume resources - returns immediately
207+
// (resources are later 'release'd on Stopped task emitTaskEvent call)
206208
mtask.waitForHostResources()
207209

210+
// If this was a 'state restore', send all unsent statuses
211+
mtask.emitCurrentStatus()
212+
208213
// Main infinite loop. This is where we receive messages and dispatch work.
209214
for {
210215
if mtask.shouldExit() {
@@ -272,6 +277,13 @@ func (mtask *managedTask) emitCurrentStatus() {
272277
// the task. It will wait for event on this task's consumedHostResourceEvent
273278
// channel from monitorQueuedTasks routine to wake up
274279
func (mtask *managedTask) waitForHostResources() {
280+
if mtask.GetKnownStatus().Terminal() {
281+
// Task's known status is STOPPED. No need to wait in this case and proceed to cleanup
282+
// This is relevant when agent restarts and a task has stopped - do not attempt
283+
// to consume resources in host resource manager
284+
return
285+
}
286+
275287
if !mtask.IsInternal && !mtask.engine.hostResourceManager.checkTaskConsumed(mtask.Arn) {
276288
// Internal tasks are started right away as their resources are not accounted for
277289
mtask.engine.enqueueTask(mtask)

0 commit comments

Comments
 (0)