From 481907e5cd8c9ce7577f44d225a7c7911bd0af3a Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Mon, 16 May 2022 15:43:35 -0700
Subject: [PATCH 1/6] Stop container when dependencies are non-resolvable

---
 agent/engine/dependencygraph/graph.go | 52 ++++++++++++++--------
 agent/engine/task_manager.go          | 62 +++++++++++++++++++++++----
 2 files changed, 87 insertions(+), 27 deletions(-)

diff --git a/agent/engine/dependencygraph/graph.go b/agent/engine/dependencygraph/graph.go
index 4444a0dcb98..73276068170 100644
--- a/agent/engine/dependencygraph/graph.go
+++ b/agent/engine/dependencygraph/graph.go
@@ -47,24 +47,40 @@ const (
 var (
 	// CredentialsNotResolvedErr is the error where a container needs to wait for
 	// credentials before it can process by agent
-	CredentialsNotResolvedErr = errors.New("dependency graph: container execution credentials not available")
+	CredentialsNotResolvedErr = &DependencyError{err: errors.New("dependency graph: container execution credentials not available")}
 	// DependentContainerNotResolvedErr is the error where a dependent container isn't in expected state
-	DependentContainerNotResolvedErr = errors.New("dependency graph: dependent container not in expected state")
+	DependentContainerNotResolvedErr = &DependencyError{err: errors.New("dependency graph: dependent container not in expected state")}
 	// ContainerPastDesiredStatusErr is the error where the container status is bigger than desired status
-	ContainerPastDesiredStatusErr = errors.New("container transition: container status is equal or greater than desired status")
+	ContainerPastDesiredStatusErr = &DependencyError{err: errors.New("container transition: container status is equal or greater than desired status")}
 	// ErrContainerDependencyNotResolved is when the container's dependencies
 	// on other containers are not resolved
-	ErrContainerDependencyNotResolved = errors.New("dependency graph: dependency on containers not resolved")
+	ErrContainerDependencyNotResolved = &DependencyError{err: errors.New("dependency graph: dependency on containers not resolved")}
 	// ErrResourceDependencyNotResolved is when the container's dependencies
 	// on task resources are not resolved
-	ErrResourceDependencyNotResolved = errors.New("dependency graph: dependency on resources not resolved")
+	ErrResourceDependencyNotResolved = &DependencyError{err: errors.New("dependency graph: dependency on resources not resolved")}
 	// ResourcePastDesiredStatusErr is the error where the task resource known status is bigger than desired status
-	ResourcePastDesiredStatusErr = errors.New("task resource transition: task resource status is equal or greater than desired status")
+	ResourcePastDesiredStatusErr = &DependencyError{err: errors.New("task resource transition: task resource status is equal or greater than desired status")}
 	// ErrContainerDependencyNotResolvedForResource is when the resource's dependencies
 	// on other containers are not resolved
-	ErrContainerDependencyNotResolvedForResource = errors.New("dependency graph: resource's dependency on containers not resolved")
+	ErrContainerDependencyNotResolvedForResource = &DependencyError{err: errors.New("dependency graph: resource's dependency on containers not resolved")}
 )
 
+// DependencyError represents an error of a container dependency. These errors can be either terminal or non-terminal.
+// Terminal dependency errors indicate that a given dependency can never be fulfilled (e.g. a container with a SUCCESS
+// dependency has stopped with an exit code other than zero).
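+//
+// Hypothetical illustration (not part of this change): if container B declares
+// a SUCCESS dependency on container A, and A stops with exit code 1, the
+// dependency check for B produces an error equivalent to
+//
+//	&DependencyError{
+//		err:        errors.New("dependency graph: dependency did not exit successfully"),
+//		isTerminal: true,
+//	}
+//
+// Callers can then use IsTerminal() to distinguish "wait and retry" from "give up".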
+type DependencyError struct {
+	err        error
+	isTerminal bool
+}
+
+func (de *DependencyError) Error() string {
+	return de.err.Error()
+}
+
+func (de *DependencyError) IsTerminal() bool {
+	return de.isTerminal
+}
+
 // ValidDependencies takes a task and verifies that it is possible to allow all
 // containers within it to reach the desired status by proceeding in some
 // order.
@@ -124,7 +140,7 @@ func DependenciesAreResolved(target *apicontainer.Container,
 	id string,
 	manager credentials.Manager,
 	resources []taskresource.TaskResource,
-	cfg *config.Config) (*apicontainer.DependsOn, error) {
+	cfg *config.Config) (*apicontainer.DependsOn, *DependencyError) {
 	if !executionCredentialsResolved(target, id, manager) {
 		return nil, CredentialsNotResolvedErr
 	}
@@ -244,7 +260,7 @@ func verifyStatusResolvable(target *apicontainer.Container, existingContainers m
 // (map from name to container). The `resolves` function passed should return true if the named container is resolved.
 func verifyContainerOrderingStatusResolvable(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container,
-	cfg *config.Config, resolves func(*apicontainer.Container, *apicontainer.Container, string, *config.Config) bool) (*apicontainer.DependsOn, error) {
+	cfg *config.Config, resolves func(*apicontainer.Container, *apicontainer.Container, string, *config.Config) bool) (*apicontainer.DependsOn, *DependencyError) {
 
 	targetGoal := target.GetDesiredStatus()
 	targetKnown := target.GetKnownStatus()
@@ -260,7 +276,7 @@ func verifyContainerOrderingStatusResolvable(target *apicontainer.Container, exi
 	for _, dependency := range targetDependencies {
 		dependencyContainer, ok := existingContainers[dependency.ContainerName]
 		if !ok {
-			return nil, fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] does not exist.", dependencyContainer, target)
+			return nil, &DependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] does not exist.", dependencyContainer, target), isTerminal: true}
 		}
 
 		// We want to check whether the dependency container has timed out only if target has not been created yet.
@@ -268,7 +284,7 @@
 		// However, if dependency container has already stopped, then it cannot time out.
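 		// (Illustrative walkthrough, not exhaustive: with targetKnown = NONE and the
 		// dependency RUNNING past its start timeout, the check below fails the
 		// dependency terminally; once the dependency reaches STOPPED, timing out is
 		// no longer possible and the check is skipped.)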
 		if targetKnown < apicontainerstatus.ContainerCreated && dependencyContainer.GetKnownStatus() != apicontainerstatus.ContainerStopped {
 			if hasDependencyTimedOut(dependencyContainer, dependency.Condition) {
-				return nil, fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] has timed out.", dependencyContainer, target)
+				return nil, &DependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] has timed out.", dependencyContainer, target), isTerminal: true}
 			}
 		}
 
@@ -276,13 +292,13 @@
 		// can then never progress to its desired state when the dependency condition is 'SUCCESS'
 		if dependency.Condition == successCondition && dependencyContainer.GetKnownStatus() == apicontainerstatus.ContainerStopped &&
 			!hasDependencyStoppedSuccessfully(dependencyContainer) {
-			return nil, fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] as dependency did not exit successfully.", dependencyContainer, target)
+			return nil, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] as dependency did not exit successfully.", dependencyContainer, target), isTerminal: true}
 		}
 
 		// For any of the dependency conditions - START/COMPLETE/SUCCESS/HEALTHY, if the dependency container has
 		// not started and will not start in the future, this dependency can never be resolved.
 		if dependencyContainer.HasNotAndWillNotStart() {
-			return nil, fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] because dependency will never start", dependencyContainer, target)
+			return nil, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] because dependency will never start", dependencyContainer, target), isTerminal: true}
 		}
 
 		if !resolves(target, dependencyContainer, dependency.Condition, cfg) {
@@ -290,14 +306,14 @@
 		}
 	}
 	if blockedDependency != nil {
-		return blockedDependency, fmt.Errorf("dependency graph: failed to resolve the container ordering dependency [%v] for target [%v]", blockedDependency, target)
+		return blockedDependency, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve the container ordering dependency [%v] for target [%v]", blockedDependency, target)}
 	}
 	return nil, nil
 }
 
 func verifyTransitionDependenciesResolved(target *apicontainer.Container,
 	existingContainers map[string]*apicontainer.Container,
-	existingResources map[string]taskresource.TaskResource) error {
+	existingResources map[string]taskresource.TaskResource) *DependencyError {
 
 	if !verifyContainerDependenciesResolved(target, existingContainers) {
 		return ErrContainerDependencyNotResolved
@@ -471,7 +487,7 @@ func verifyContainerOrderingStatus(dependsOnContainer *apicontainer.Container) b
 		dependsOnContainerDesiredStatus == dependsOnContainer.GetSteadyStateStatus()
 }
 
-func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container) error {
+func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container) *DependencyError {
 	// We considered adding this to the task state, but this will be at most 45 loops,
 	// so we err'd on the side of having less state.
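 	// Illustrative example: if container A declares a DependsOn on container B,
 	// B (the target here) must outlive A; while A has not stopped yet, A's name
 	// is collected below as a missing shutdown dependency and B's stop is deferred.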
 	missingShutdownDependencies := []string{}
 
@@ -493,8 +509,8 @@ func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[
 		return nil
 	}
 
-	return fmt.Errorf("dependency graph: target %s needs other containers stopped before it can stop: [%s]",
-		target.Name, strings.Join(missingShutdownDependencies, "], ["))
+	return &DependencyError{err: fmt.Errorf("dependency graph: target %s needs other containers stopped before it can stop: [%s]",
+		target.Name, strings.Join(missingShutdownDependencies, "], ["))}
 }
 
 func onSteadyStateCanResolve(target *apicontainer.Container, run *apicontainer.Container) bool {
diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go
index a0ea9274b96..55e781f6b01 100644
--- a/agent/engine/task_manager.go
+++ b/agent/engine/task_manager.go
@@ -87,7 +87,7 @@ type containerTransition struct {
 	nextState      apicontainerstatus.ContainerStatus
 	actionRequired bool
 	blockedOn      *apicontainer.DependsOn
-	reason         error
+	reason         *dependencygraph.DependencyError
 }
 
 // resourceTransition defines the struct for a resource to transition.
@@ -1003,6 +1003,17 @@ func (mtask *managedTask) progressTask() {
 
 	blockedByOrderingDependencies := len(blockedDependencies) > 0
 
+	logger.Info(">>>>>atLeastOneTransitionStarted", logger.Fields{
+		"value": atLeastOneTransitionStarted,
+	})
+
+	for _, b := range blockedDependencies {
+		logger.Info(">>>>>blocked on ", logger.Fields{
+			"container": b.ContainerName,
+			"condition": b.Condition,
+		})
+	}
+
 	// If no transitions happened and we aren't blocked by ordering dependencies, then we are possibly in a state where
 	// its impossible for containers to move forward. We will do an additional check to see if we are waiting for ACS
 	// execution credentials. If not, then we will abort the task progression.
@@ -1088,6 +1099,9 @@ func (mtask *managedTask) startContainerTransitions(transitionFunc containerTran
 	for _, cont := range mtask.Containers {
 		transition := mtask.containerNextState(cont)
 		if transition.reason != nil {
+			if transition.reason.IsTerminal() {
+				mtask.handleTerminalDependencyError(cont, transition.reason)
+			}
 			// container can't be transitioned
 			reasons = append(reasons, transition.reason)
 			if transition.blockedOn != nil {
@@ -1128,6 +1142,27 @@
 	return anyCanTransition, blocked, transitions, reasons
 }
 
+func (mtask *managedTask) handleTerminalDependencyError(container *apicontainer.Container, error *dependencygraph.DependencyError) {
+	container.SetDesiredStatus(apicontainerstatus.ContainerStopped)
+	exitCode := 143
+	container.SetKnownExitCode(&exitCode)
+	// Change container status to STOPPED with exit code 143. This exit code is what docker reports when
+	// a container receives SIGTERM. In this case it's technically not true that we send SIGTERM because the
+	// container didn't even start, but we have to report an error and 143 seems the most appropriate.
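+	// (For reference: 143 = 128 + 15, the conventional exit code for a process
+	// terminated by signal 15, SIGTERM.)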
+	go func(cont *apicontainer.Container) {
+		mtask.dockerMessages <- dockerContainerChange{
+			container: cont,
+			event: dockerapi.DockerContainerChangeEvent{
+				Status: apicontainerstatus.ContainerStopped,
+				DockerContainerMetadata: dockerapi.DockerContainerMetadata{
+					Error:    dockerapi.CannotStartContainerError{FromError: error},
+					ExitCode: &exitCode,
+				},
+			},
+		}
+	}(container)
+}
+
 // startResourceTransitions steps through each resource in the task and calls
 // the passed transition function when a transition should occur
 func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransitionFunc) (bool, map[string]string) {
@@ -1370,10 +1405,6 @@ func (mtask *managedTask) resourceNextState(resource taskresource.TaskResource)
 }
 
 func (mtask *managedTask) handleContainersUnableToTransitionState() {
-	logger.Critical("Task in a bad state; it's not steady state but no containers want to transition", logger.Fields{
-		field.TaskID: mtask.GetID(),
-	})
-
 	if mtask.GetDesiredStatus().Terminal() {
 		// Ack, really bad. We want it to stop but the containers don't think
 		// that's possible. let's just break out and hope for the best!
@@ -1384,10 +1415,23 @@ func (mtask *managedTask) handleContainersUnableToTransitionState() {
 		mtask.emitTaskEvent(mtask.Task, taskUnableToTransitionToStoppedReason)
 		// TODO we should probably panic here
 	} else {
-		logger.Critical("Moving task to stopped due to bad state", logger.Fields{
-			field.TaskID: mtask.GetID(),
-		})
-		mtask.handleDesiredStatusChange(apitaskstatus.TaskStopped, 0)
+		// If we end up here, it means containers are not able to transition anymore; maybe because of dependencies that
+		// are unable to start. Therefore, if there are essential containers that haven't started yet, we need to
+		// stop the task since they are not going to start anymore.
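+		// Illustrative example: an essential "app" container blocked behind a
+		// terminally failed SUCCESS dependency will never reach steady state, so
+		// the loop below flags the task to be stopped.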
+		stopTask := false
+		for _, c := range mtask.Containers {
+			if c.IsEssential() && !c.IsKnownSteadyState() {
+				stopTask = true
+				break
+			}
+		}
+
+		if stopTask {
+			logger.Critical("Task in a bad state; it's not steady state but no containers want to transition", logger.Fields{
+				field.TaskID: mtask.GetID(),
+			})
+			mtask.handleDesiredStatusChange(apitaskstatus.TaskStopped, 0)
+		}
 	}
 }

From 1836472853db4957d7cee0a32dcebb142dea69f5 Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Thu, 19 May 2022 15:17:35 -0700
Subject: [PATCH 2/6] Fix ordering test cases

---
 agent/engine/dependencygraph/graph.go | 45 +++++++++++++++------------
 agent/engine/task_manager.go          | 12 +++++--
 agent/engine/task_manager_test.go     |  6 ++--
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/agent/engine/dependencygraph/graph.go b/agent/engine/dependencygraph/graph.go
index 73276068170..7ef8d64f1ea 100644
--- a/agent/engine/dependencygraph/graph.go
+++ b/agent/engine/dependencygraph/graph.go
@@ -47,37 +47,42 @@ const (
 var (
 	// CredentialsNotResolvedErr is the error where a container needs to wait for
 	// credentials before it can process by agent
-	CredentialsNotResolvedErr = &DependencyError{err: errors.New("dependency graph: container execution credentials not available")}
+	CredentialsNotResolvedErr = &dependencyError{err: errors.New("dependency graph: container execution credentials not available")}
 	// DependentContainerNotResolvedErr is the error where a dependent container isn't in expected state
-	DependentContainerNotResolvedErr = &DependencyError{err: errors.New("dependency graph: dependent container not in expected state")}
+	DependentContainerNotResolvedErr = &dependencyError{err: errors.New("dependency graph: dependent container not in expected state")}
 	// ContainerPastDesiredStatusErr is the error where the container status is bigger than desired status
-	ContainerPastDesiredStatusErr = &DependencyError{err: errors.New("container transition: container status is equal or greater than desired status")}
+	ContainerPastDesiredStatusErr = &dependencyError{err: errors.New("container transition: container status is equal or greater than desired status")}
 	// ErrContainerDependencyNotResolved is when the container's dependencies
 	// on other containers are not resolved
-	ErrContainerDependencyNotResolved = &DependencyError{err: errors.New("dependency graph: dependency on containers not resolved")}
+	ErrContainerDependencyNotResolved = &dependencyError{err: errors.New("dependency graph: dependency on containers not resolved")}
 	// ErrResourceDependencyNotResolved is when the container's dependencies
 	// on task resources are not resolved
-	ErrResourceDependencyNotResolved = &DependencyError{err: errors.New("dependency graph: dependency on resources not resolved")}
+	ErrResourceDependencyNotResolved = &dependencyError{err: errors.New("dependency graph: dependency on resources not resolved")}
 	// ResourcePastDesiredStatusErr is the error where the task resource known status is bigger than desired status
-	ResourcePastDesiredStatusErr = &DependencyError{err: errors.New("task resource transition: task resource status is equal or greater than desired status")}
+	ResourcePastDesiredStatusErr = &dependencyError{err: errors.New("task resource transition: task resource status is equal or greater than desired status")}
 	// ErrContainerDependencyNotResolvedForResource is when the resource's dependencies
 	// on other containers are not resolved
-	ErrContainerDependencyNotResolvedForResource = &DependencyError{err: errors.New("dependency graph: resource's dependency on containers not resolved")}
errors.New("dependency graph: resource's dependency on containers not resolved")} + ErrContainerDependencyNotResolvedForResource = &dependencyError{err: errors.New("dependency graph: resource's dependency on containers not resolved")} ) // DependencyError represents an error of a container dependency. These errors can be either terminal or non-terminal. // Terminal dependency errors indicate that a given dependency can never be fulfilled (e.g. a container with a SUCCESS // dependency has stopped with an exit code other than zero). -type DependencyError struct { +type DependencyError interface { + Error() string + IsTerminal() bool +} + +type dependencyError struct { err error isTerminal bool } -func (de *DependencyError) Error() string { +func (de *dependencyError) Error() string { return de.err.Error() } -func (de *DependencyError) IsTerminal() bool { +func (de *dependencyError) IsTerminal() bool { return de.isTerminal } @@ -140,7 +145,7 @@ func DependenciesAreResolved(target *apicontainer.Container, id string, manager credentials.Manager, resources []taskresource.TaskResource, - cfg *config.Config) (*apicontainer.DependsOn, *DependencyError) { + cfg *config.Config) (*apicontainer.DependsOn, DependencyError) { if !executionCredentialsResolved(target, id, manager) { return nil, CredentialsNotResolvedErr } @@ -260,7 +265,7 @@ func verifyStatusResolvable(target *apicontainer.Container, existingContainers m // (map from name to container). The `resolves` function passed should return true if the named container is resolved. func verifyContainerOrderingStatusResolvable(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container, - cfg *config.Config, resolves func(*apicontainer.Container, *apicontainer.Container, string, *config.Config) bool) (*apicontainer.DependsOn, *DependencyError) { + cfg *config.Config, resolves func(*apicontainer.Container, *apicontainer.Container, string, *config.Config) bool) (*apicontainer.DependsOn, DependencyError) { targetGoal := target.GetDesiredStatus() targetKnown := target.GetKnownStatus() @@ -276,7 +281,7 @@ func verifyContainerOrderingStatusResolvable(target *apicontainer.Container, exi for _, dependency := range targetDependencies { dependencyContainer, ok := existingContainers[dependency.ContainerName] if !ok { - return nil, &DependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] does not exist.", dependencyContainer, target), isTerminal: true} + return nil, &dependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] does not exist.", dependencyContainer, target), isTerminal: true} } // We want to check whether the dependency container has timed out only if target has not been created yet. @@ -284,7 +289,7 @@ func verifyContainerOrderingStatusResolvable(target *apicontainer.Container, exi // However, if dependency container has already stopped, then it cannot time out. 
 		if targetKnown < apicontainerstatus.ContainerCreated && dependencyContainer.GetKnownStatus() != apicontainerstatus.ContainerStopped {
 			if hasDependencyTimedOut(dependencyContainer, dependency.Condition) {
-				return nil, &DependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] has timed out.", dependencyContainer, target), isTerminal: true}
+				return nil, &dependencyError{err: fmt.Errorf("dependency graph: container ordering dependency [%v] for target [%v] has timed out.", dependencyContainer, target), isTerminal: true}
 			}
 		}
 
@@ -292,13 +297,13 @@
 		// can then never progress to its desired state when the dependency condition is 'SUCCESS'
 		if dependency.Condition == successCondition && dependencyContainer.GetKnownStatus() == apicontainerstatus.ContainerStopped &&
 			!hasDependencyStoppedSuccessfully(dependencyContainer) {
-			return nil, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] as dependency did not exit successfully.", dependencyContainer, target), isTerminal: true}
+			return nil, &dependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] as dependency did not exit successfully.", dependencyContainer, target), isTerminal: true}
 		}
 
 		// For any of the dependency conditions - START/COMPLETE/SUCCESS/HEALTHY, if the dependency container has
 		// not started and will not start in the future, this dependency can never be resolved.
 		if dependencyContainer.HasNotAndWillNotStart() {
-			return nil, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] because dependency will never start", dependencyContainer, target), isTerminal: true}
+			return nil, &dependencyError{err: fmt.Errorf("dependency graph: failed to resolve container ordering dependency [%v] for target [%v] because dependency will never start", dependencyContainer, target), isTerminal: true}
 		}
 
 		if !resolves(target, dependencyContainer, dependency.Condition, cfg) {
@@ -306,14 +311,14 @@
 		}
 	}
 	if blockedDependency != nil {
-		return blockedDependency, &DependencyError{err: fmt.Errorf("dependency graph: failed to resolve the container ordering dependency [%v] for target [%v]", blockedDependency, target)}
+		return blockedDependency, &dependencyError{err: fmt.Errorf("dependency graph: failed to resolve the container ordering dependency [%v] for target [%v]", blockedDependency, target)}
 	}
 	return nil, nil
 }
 
 func verifyTransitionDependenciesResolved(target *apicontainer.Container,
 	existingContainers map[string]*apicontainer.Container,
-	existingResources map[string]taskresource.TaskResource) *DependencyError {
+	existingResources map[string]taskresource.TaskResource) DependencyError {
 
 	if !verifyContainerDependenciesResolved(target, existingContainers) {
 		return ErrContainerDependencyNotResolved
@@ -487,7 +492,7 @@ func verifyContainerOrderingStatus(dependsOnContainer *apicontainer.Container) b
 		dependsOnContainerDesiredStatus == dependsOnContainer.GetSteadyStateStatus()
 }
 
-func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container) *DependencyError {
+func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[string]*apicontainer.Container) DependencyError {
 	// We considered adding this to the task state, but this will be at most 45 loops,
 	// so we err'd on the side of having less state.
 	missingShutdownDependencies := []string{}
 
@@ -509,7 +514,7 @@ func verifyShutdownOrder(target *apicontainer.Container, existingContainers map[
 		return nil
 	}
 
-	return &DependencyError{err: fmt.Errorf("dependency graph: target %s needs other containers stopped before it can stop: [%s]",
+	return &dependencyError{err: fmt.Errorf("dependency graph: target %s needs other containers stopped before it can stop: [%s]",
 		target.Name, strings.Join(missingShutdownDependencies, "], ["))}
 }
 
diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go
index 55e781f6b01..7030caf5911 100644
--- a/agent/engine/task_manager.go
+++ b/agent/engine/task_manager.go
@@ -87,7 +87,7 @@ type containerTransition struct {
 	nextState      apicontainerstatus.ContainerStatus
 	actionRequired bool
 	blockedOn      *apicontainer.DependsOn
-	reason         *dependencygraph.DependencyError
+	reason         dependencygraph.DependencyError
 }
 
 // resourceTransition defines the struct for a resource to transition.
@@ -1142,7 +1142,7 @@
 	return anyCanTransition, blocked, transitions, reasons
 }
 
-func (mtask *managedTask) handleTerminalDependencyError(container *apicontainer.Container, error *dependencygraph.DependencyError) {
+func (mtask *managedTask) handleTerminalDependencyError(container *apicontainer.Container, error dependencygraph.DependencyError) {
 	container.SetDesiredStatus(apicontainerstatus.ContainerStopped)
 	exitCode := 143
 	container.SetKnownExitCode(&exitCode)
@@ -1419,11 +1419,19 @@ func (mtask *managedTask) handleContainersUnableToTransitionState() {
 		// are unable to start. Therefore, if there are essential containers that haven't started yet, we need to
 		// stop the task since they are not going to start anymore.
 		stopTask := false
+		containersRunning := 0
 		for _, c := range mtask.Containers {
 			if c.IsEssential() && !c.IsKnownSteadyState() {
 				stopTask = true
 				break
 			}
+			if c.IsKnownSteadyState() {
+				containersRunning++
+			}
+		}
+
+		if containersRunning == 0 {
+			stopTask = true
 		}
 
 		if stopTask {
diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go
index 1ab79ddfcc2..9c92377da49 100644
--- a/agent/engine/task_manager_test.go
+++ b/agent/engine/task_manager_test.go
@@ -240,7 +240,7 @@ func TestContainerNextState(t *testing.T) {
 		containerDesiredStatus       apicontainerstatus.ContainerStatus
 		expectedContainerStatus      apicontainerstatus.ContainerStatus
 		expectedTransitionActionable bool
-		reason                       error
+		reason                       dependencygraph.DependencyError
 	}{
 		// NONE -> RUNNING transition is allowed and actionable, when desired is Running
 		// The expected next status is Pulled
@@ -928,8 +928,8 @@ func TestOnContainersUnableToTransitionStateForDesiredRunningTask(t *testing.T)
 	}
 
 	task.handleContainersUnableToTransitionState()
-	assert.Equal(t, task.GetDesiredStatus(), apitaskstatus.TaskStopped)
-	assert.Equal(t, task.Containers[0].GetDesiredStatus(), apicontainerstatus.ContainerStopped)
+	assert.Equal(t, apitaskstatus.TaskStopped, task.GetDesiredStatus())
+	assert.Equal(t, apicontainerstatus.ContainerStopped, task.Containers[0].GetDesiredStatus())
 }
 
 // TODO: Test progressContainers workflow

From 83d8ed35ee94d359c903fd519532beaf79fdd18b Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Thu, 19 May 2022 15:22:56 -0700
Subject: [PATCH 3/6] Logging terminal error

---
 agent/engine/task_manager.go | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go
index 7030caf5911..cb517ed4913 100644
--- a/agent/engine/task_manager.go
+++ b/agent/engine/task_manager.go
@@ -1003,6 +1003,17 @@ func (mtask *managedTask) progressTask() {
 
 	blockedByOrderingDependencies := len(blockedDependencies) > 0
 
-	logger.Info(">>>>>atLeastOneTransitionStarted", logger.Fields{
-		"value": atLeastOneTransitionStarted,
-	})
-
-	for _, b := range blockedDependencies {
-		logger.Info(">>>>>blocked on ", logger.Fields{
-			"container": b.ContainerName,
-			"condition": b.Condition,
-		})
-	}
-
 	// If no transitions happened and we aren't blocked by ordering dependencies, then we are possibly in a state where
 	// its impossible for containers to move forward. We will do an additional check to see if we are waiting for ACS
 	// execution credentials. If not, then we will abort the task progression.
@@ -1143,6 +1132,10 @@
 }
 
 func (mtask *managedTask) handleTerminalDependencyError(container *apicontainer.Container, error dependencygraph.DependencyError) {
+	logger.Error("Terminal error detected during transition; marking container as stopped", logger.Fields{
+		field.Container: container.Name,
+		field.Error:     error.Error(),
+	})
 	container.SetDesiredStatus(apicontainerstatus.ContainerStopped)
 	exitCode := 143
 	container.SetKnownExitCode(&exitCode)

From 0268dec2706328b81d3b6fbd75c2d1ab368e50c9 Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Thu, 19 May 2022 15:42:18 -0700
Subject: [PATCH 4/6] Fix ordering test cases

---
 agent/engine/task_manager.go      | 10 +-----
 agent/engine/task_manager_test.go | 58 +++++++++++++++++++++----------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go
index cb517ed4913..9a4f32581a3 100644
--- a/agent/engine/task_manager.go
+++ b/agent/engine/task_manager.go
@@ -1410,21 +1410,13 @@ func (mtask *managedTask) handleContainersUnableToTransitionState() {
 		// If we end up here, it means containers are not able to transition anymore; maybe because of dependencies that
 		// are unable to start. Therefore, if there are essential containers that haven't started yet, we need to
-		// stop the task since they are not going to start anymore.
+		// stop the task since they are not going to start.
 		stopTask := false
-		containersRunning := 0
 		for _, c := range mtask.Containers {
 			if c.IsEssential() && !c.IsKnownSteadyState() {
 				stopTask = true
 				break
 			}
-			if c.IsKnownSteadyState() {
-				containersRunning++
-			}
-		}
-
-		if containersRunning == 0 {
-			stopTask = true
 		}
 
 		if stopTask {
diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go
index 9c92377da49..f5986f6a6f1 100644
--- a/agent/engine/task_manager_test.go
+++ b/agent/engine/task_manager_test.go
@@ -908,28 +908,48 @@ func TestOnContainersUnableToTransitionStateForDesiredStoppedTask(t *testing.T)
 }
 
 func TestOnContainersUnableToTransitionStateForDesiredRunningTask(t *testing.T) {
-	firstContainerName := "container1"
-	firstContainer := &apicontainer.Container{
-		KnownStatusUnsafe:   apicontainerstatus.ContainerCreated,
-		DesiredStatusUnsafe: apicontainerstatus.ContainerRunning,
-		Name:                firstContainerName,
-	}
-	task := &managedTask{
-		Task: &apitask.Task{
-			Containers: []*apicontainer.Container{
-				firstContainer,
-			},
-			DesiredStatusUnsafe: apitaskstatus.TaskRunning,
+	for _, tc := range []struct {
+		knownStatus                    apicontainerstatus.ContainerStatus
+		expectedContainerDesiredStatus apicontainerstatus.ContainerStatus
+		expectedTaskDesiredStatus      apitaskstatus.TaskStatus
+	}{
+		{
+			knownStatus:                    apicontainerstatus.ContainerCreated,
+			expectedContainerDesiredStatus: apicontainerstatus.ContainerStopped,
+			expectedTaskDesiredStatus:      apitaskstatus.TaskStopped,
 		},
-		engine: &DockerTaskEngine{
-			dataClient: data.NewNoopClient(),
+		{
+			knownStatus:                    apicontainerstatus.ContainerRunning,
+			expectedContainerDesiredStatus: apicontainerstatus.ContainerRunning,
+			expectedTaskDesiredStatus:      apitaskstatus.TaskRunning,
 		},
-		ctx: context.TODO(),
-	}
+	} {
+		t.Run(fmt.Sprintf("Essential container with knownStatus=%s", tc.knownStatus.String()), func(t *testing.T) {
+			firstContainerName := "container1"
+			firstContainer := &apicontainer.Container{
+				KnownStatusUnsafe:   tc.knownStatus,
+				DesiredStatusUnsafe: apicontainerstatus.ContainerRunning,
+				Name:                firstContainerName,
+				Essential:           true, // setting this to true since at least one container in the task must be essential
+			}
+			task := &managedTask{
+				Task: &apitask.Task{
+					Containers: []*apicontainer.Container{
+						firstContainer,
+					},
+					DesiredStatusUnsafe: apitaskstatus.TaskRunning,
+				},
+				engine: &DockerTaskEngine{
+					dataClient: data.NewNoopClient(),
+				},
+				ctx: context.TODO(),
+			}
 
-	task.handleContainersUnableToTransitionState()
-	assert.Equal(t, apitaskstatus.TaskStopped, task.GetDesiredStatus())
-	assert.Equal(t, apicontainerstatus.ContainerStopped, task.Containers[0].GetDesiredStatus())
+			task.handleContainersUnableToTransitionState()
+			assert.Equal(t, tc.expectedTaskDesiredStatus, task.GetDesiredStatus())
+			assert.Equal(t, tc.expectedContainerDesiredStatus, task.Containers[0].GetDesiredStatus())
+		})
+	}
 }
 
 // TODO: Test progressContainers workflow

From 01cbc0357c4b5b1a6eb4095624958f51c66c86bf Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Thu, 19 May 2022 16:12:43 -0700
Subject: [PATCH 5/6] Fix ordering test cases

---
 agent/engine/task_manager_test.go | 81 +++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go
index f5986f6a6f1..811f06f3fb2 100644
--- a/agent/engine/task_manager_test.go
+++ b/agent/engine/task_manager_test.go
@@ -19,6 +19,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"github.com/aws/aws-sdk-go/aws"
 	"sync"
 	"testing"
 	"time"
@@ -728,6 +729,86 @@ func TestStartContainerTransitionsWhenForwardTransitionIsNotPossible(t *testing.
 	assert.Empty(t, transitions)
 }
 
+func TestStartContainerTransitionsWithTerminalError(t *testing.T) {
+	firstContainerName := "container1"
+	firstContainer := &apicontainer.Container{
+		KnownStatusUnsafe:   apicontainerstatus.ContainerStopped,
+		DesiredStatusUnsafe: apicontainerstatus.ContainerStopped,
+		KnownExitCodeUnsafe: aws.Int(1), // This simulates a container that has stopped unsuccessfully
+		Name:                firstContainerName,
+	}
+	secondContainerName := "container2"
+	secondContainer := &apicontainer.Container{
+		KnownStatusUnsafe:   apicontainerstatus.ContainerCreated,
+		DesiredStatusUnsafe: apicontainerstatus.ContainerRunning,
+		Name:                secondContainerName,
+		DependsOnUnsafe: []apicontainer.DependsOn{
+			{
+				ContainerName: firstContainerName,
+				Condition:     "SUCCESS", // This condition can never be fulfilled since container1 has exited with a non-zero code
+			},
+		},
+	}
+	thirdContainerName := "container3"
+	thirdContainer := &apicontainer.Container{
+		KnownStatusUnsafe:   apicontainerstatus.ContainerCreated,
+		DesiredStatusUnsafe: apicontainerstatus.ContainerRunning,
+		Name:                thirdContainerName,
+		DependsOnUnsafe: []apicontainer.DependsOn{
+			{
+				ContainerName: secondContainerName,
+				Condition:     "SUCCESS", // This condition can never be fulfilled either, since container2 itself can never start and exit successfully
+			},
+		},
+	}
+	dockerMessagesChan := make(chan dockerContainerChange)
+	task := &managedTask{
+		Task: &apitask.Task{
+			Containers: []*apicontainer.Container{
+				firstContainer,
+				secondContainer,
+				thirdContainer,
+			},
+			DesiredStatusUnsafe: apitaskstatus.TaskRunning,
+		},
+		engine:         &DockerTaskEngine{},
+		dockerMessages: dockerMessagesChan,
+	}
+
+	canTransition, _, transitions, errors := task.startContainerTransitions(
+		func(cont *apicontainer.Container, nextStatus apicontainerstatus.ContainerStatus) {
+			t.Error("Transition function should not be called when no transitions are possible")
+		})
+	assert.False(t, canTransition)
+	assert.Empty(t, transitions)
+	assert.Equal(t, 3, len(errors)) // the first error just indicates container1 is already at its desired status; the following errors should be terminal
+	assert.False(t, errors[0].(dependencygraph.DependencyError).IsTerminal(), "Error should NOT be terminal")
+	assert.True(t, errors[1].(dependencygraph.DependencyError).IsTerminal(), "Error should be terminal")
+	assert.True(t, errors[2].(dependencygraph.DependencyError).IsTerminal(), "Error should be terminal")
+
+	stoppedMessages := make(map[string]dockerContainerChange)
+	// Verify that a STOPPED message is sent for each of the two blocked containers
+	for i := 0; i < 2; i++ {
+		select {
+		case msg := <-dockerMessagesChan:
+			stoppedMessages[msg.container.Name] = msg
+		case <-time.After(time.Second):
+			t.Fatal("Timed out waiting for docker messages")
+		}
+	}
+
+	assert.Equal(t, secondContainer, stoppedMessages[secondContainerName].container)
+	assert.Equal(t, apicontainerstatus.ContainerStopped, stoppedMessages[secondContainerName].event.Status)
+	assert.Error(t, stoppedMessages[secondContainerName].event.DockerContainerMetadata.Error)
+	assert.Equal(t, 143, *stoppedMessages[secondContainerName].event.DockerContainerMetadata.ExitCode)
+
+	assert.Equal(t, thirdContainer, stoppedMessages[thirdContainerName].container)
+	assert.Equal(t, apicontainerstatus.ContainerStopped, stoppedMessages[thirdContainerName].event.Status)
+	assert.Error(t, stoppedMessages[thirdContainerName].event.DockerContainerMetadata.Error)
+	assert.Equal(t, 143, *stoppedMessages[thirdContainerName].event.DockerContainerMetadata.ExitCode)
+}
+
 func TestStartContainerTransitionsInvokesHandleContainerChange(t *testing.T) {
 	eventStreamName := "TESTTASKENGINE"

From 1fe541f00191aa8d2639e8bedcffa9bcf72a33f5 Mon Sep 17 00:00:00 2001
From: Angel Velazquez
Date: Thu, 19 May 2022 16:20:18 -0700
Subject: [PATCH 6/6] fix format

---
 agent/engine/task_manager_test.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go
index 811f06f3fb2..7f949e9c62a 100644
--- a/agent/engine/task_manager_test.go
+++ b/agent/engine/task_manager_test.go
@@ -19,11 +19,12 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"github.com/aws/aws-sdk-go/aws"
 	"sync"
 	"testing"
 	"time"
 
+	"github.com/aws/aws-sdk-go/aws"
+
 	"github.com/aws/amazon-ecs-agent/agent/dockerclient/dockerapi"
 	"github.com/aws/amazon-ecs-agent/agent/taskresource"
 	mock_taskresource "github.com/aws/amazon-ecs-agent/agent/taskresource/mocks"
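
A minimal, self-contained sketch of the behavior this series implements (all names below are hypothetical and not taken from the agent's code): a terminal dependency error short-circuits the retry loop and is surfaced as a synthesized STOPPED event with exit code 143, while a non-terminal error just means "keep waiting".

    package main

    import (
        "errors"
        "fmt"
    )

    // depError mirrors the shape of the agent's dependencyError: a wrapped
    // error plus a flag saying whether the dependency can ever be fulfilled.
    type depError struct {
        err      error
        terminal bool
    }

    func (e *depError) Error() string    { return e.err.Error() }
    func (e *depError) IsTerminal() bool { return e.terminal }

    // resolveSuccessDependency models a SUCCESS dependency check: a dependency
    // that already stopped with a non-zero exit code can never be fulfilled.
    func resolveSuccessDependency(depStopped bool, depExitCode int) *depError {
        if depStopped && depExitCode != 0 {
            return &depError{err: errors.New("dependency did not exit successfully"), terminal: true}
        }
        if !depStopped {
            // Non-terminal: the dependency may still stop successfully later.
            return &depError{err: errors.New("dependency not yet stopped")}
        }
        return nil
    }

    func main() {
        const sigtermExitCode = 143 // 128 + SIGTERM(15), the code the agent reports

        if err := resolveSuccessDependency(true, 1); err != nil && err.IsTerminal() {
            // The agent reacts by marking the container STOPPED instead of retrying.
            fmt.Printf("stop container: %v (exit code %d)\n", err, sigtermExitCode)
        }
    }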