diff --git a/CHANGELOG.md b/CHANGELOG.md index 147f2a3bd0f..42291e3f9d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ [#1014](https://github.com/aws/amazon-ecs-agent/pull/1014) * Enhancement - Support `init` process in containers by adding support for Docker remote API client version 1.25 [#996](https://github.com/aws/amazon-ecs-agent/pull/996) +* Bug - Fixed a bug where tasks that fail to pull containers can cause the + agent to fail to restore properly after a restart. + [#1033](https://github.com/aws/amazon-ecs-agent/pull/1033) ## 1.14.5 * Enhancement - Retry failed container image pull operations [#975](https://github.com/aws/amazon-ecs-agent/pull/975) diff --git a/agent/engine/dockerstate/docker_task_engine_state.go b/agent/engine/dockerstate/docker_task_engine_state.go index 69e40950426..648a360a815 100644 --- a/agent/engine/dockerstate/docker_task_engine_state.go +++ b/agent/engine/dockerstate/docker_task_engine_state.go @@ -318,19 +318,7 @@ func (state *DockerTaskEngineState) AddContainer(container *api.DockerContainer, state.tasks[task.Arn] = task } - if container.DockerID != "" { - // Update the container id to the state - state.idToContainer[container.DockerID] = container - state.idToTask[container.DockerID] = task.Arn - - // Remove the previously added name mapping - delete(state.idToContainer, container.DockerName) - delete(state.idToTask, container.DockerName) - } else if container.DockerName != "" { - // Update the container name mapping to the state when the ID isn't available - state.idToContainer[container.DockerName] = container - state.idToTask[container.DockerName] = task.Arn - } + state.storeIDToContainerTaskUnsafe(container, task) existingMap, exists := state.taskToID[task.Arn] if !exists { @@ -364,19 +352,54 @@ func (state *DockerTaskEngineState) RemoveTask(task *api.Task) { task, ok := state.tasks[task.Arn] if !ok { + seelog.Warnf("Failed to locate task %s for removal from state", task.Arn) return } delete(state.tasks, task.Arn) + containerMap, ok := state.taskToID[task.Arn] if !ok { + seelog.Warnf("Failed to locate containerMap for task %s for removal from state", task.Arn) return } delete(state.taskToID, task.Arn) for _, dockerContainer := range containerMap { - delete(state.idToTask, dockerContainer.DockerID) - delete(state.idToContainer, dockerContainer.DockerID) + state.removeIDToContainerTaskUnsafe(dockerContainer) + } +} + +// storeIDToContainerTaskUnsafe stores the container in the idToContainer and idToTask maps. The key to the maps is +// either the Docker-generated ID or the agent-generated name (if the ID is not available). If the container is updated +// with an ID, a subsequent call to this function will update the map to use the ID as the key. +func (state *DockerTaskEngineState) storeIDToContainerTaskUnsafe(container *api.DockerContainer, task *api.Task) { + if container.DockerID != "" { + // Update the container id to the state + state.idToContainer[container.DockerID] = container + state.idToTask[container.DockerID] = task.Arn + + // Remove the previously added name mapping + delete(state.idToContainer, container.DockerName) + delete(state.idToTask, container.DockerName) + } else if container.DockerName != "" { + // Update the container name mapping to the state when the ID isn't available + state.idToContainer[container.DockerName] = container + state.idToTask[container.DockerName] = task.Arn + } +} + +// removeIDToContainerTaskUnsafe removes the container from the idToContainer and idToTask maps. They key to the maps +// is either the Docker-generated ID or the agent-generated name (if the ID is not available). This function assumes +// that the ID takes precedence and will delete by the ID when the ID is available. +func (state *DockerTaskEngineState) removeIDToContainerTaskUnsafe(container *api.DockerContainer) { + // The key to these maps is either the Docker ID or agent-generated name. We use the agent-generated name + // before a Docker ID is available. + key := container.DockerID + if key == "" { + key = container.DockerName } + delete(state.idToTask, key) + delete(state.idToContainer, key) } // RemoveImageState removes an image.ImageState diff --git a/agent/engine/dockerstate/dockerstate_test.go b/agent/engine/dockerstate/dockerstate_test.go index 38ed13c2d71..feb45dd6c6d 100644 --- a/agent/engine/dockerstate/dockerstate_test.go +++ b/agent/engine/dockerstate/dockerstate_test.go @@ -171,32 +171,42 @@ func TestTwophaseAddContainer(t *testing.T) { func TestRemoveTask(t *testing.T) { state := NewTaskEngineState() - testContainer := &api.Container{ + testContainer1 := &api.Container{ Name: "c1", } - testDockerContainer := &api.DockerContainer{ + testDockerContainer1 := &api.DockerContainer{ DockerID: "did", - Container: testContainer, + Container: testContainer1, + } + testContainer2 := &api.Container{ + Name: "c2", + } + testDockerContainer2 := &api.DockerContainer{ + // DockerName is used before the DockerID is assigned + DockerName: "docker-name-2", + Container: testContainer2, } testTask := &api.Task{ Arn: "t1", - Containers: []*api.Container{testContainer}, + Containers: []*api.Container{testContainer1, testContainer2}, } state.AddTask(testTask) - state.AddContainer(testDockerContainer, testTask) + state.AddContainer(testDockerContainer1, testTask) + state.AddContainer(testDockerContainer2, testTask) - tasks := state.AllTasks() - if len(tasks) != 1 { - t.Error("Expected only 1 task") - } + engineState := state.(*DockerTaskEngineState) + + assert.Len(t, state.AllTasks(), 1, "Expected one task") + assert.Len(t, engineState.idToTask, 2, "idToTask map should have two entries") + assert.Len(t, engineState.idToContainer, 2, "idToContainer map should have two entries") state.RemoveTask(testTask) - tasks = state.AllTasks() - if len(tasks) != 0 { - t.Error("Expected task to be removed") - } + assert.Len(t, state.AllTasks(), 0, "Expected task to be removed") + assert.Len(t, engineState.idToTask, 0, "idToTask map should be empty") + assert.Len(t, engineState.idToContainer, 0, "idToContainer map should be empty") + } func TestAddImageState(t *testing.T) { diff --git a/agent/engine/dockerstate/testutils/json_test.go b/agent/engine/dockerstate/testutils/json_test.go index 5ce925556b4..6db0a098eb9 100644 --- a/agent/engine/dockerstate/testutils/json_test.go +++ b/agent/engine/dockerstate/testutils/json_test.go @@ -22,6 +22,7 @@ import ( "github.com/aws/amazon-ecs-agent/agent/api" "github.com/aws/amazon-ecs-agent/agent/engine/dockerstate" + "github.com/stretchr/testify/assert" ) func createTestContainer(num int) *api.Container { @@ -50,14 +51,12 @@ func createTestTask(arn string, numContainers int) *api.Task { func decodeEqual(t *testing.T, state dockerstate.TaskEngineState) dockerstate.TaskEngineState { data, err := json.Marshal(&state) - if err != nil { - t.Error(err) - } + assert.NoError(t, err, "marshal state") + otherState := dockerstate.NewTaskEngineState() err = json.Unmarshal(data, &otherState) - if err != nil { - t.Error(err) - } + assert.NoError(t, err, "unmarshal state") + if !DockerStatesEqual(state, otherState) { debug.PrintStack() t.Error("States were not equal") @@ -77,8 +76,5 @@ func TestJsonEncoding(t *testing.T) { } other := decodeEqual(t, testState) _, ok := other.ContainerMapByArn("test1") - if !ok { - t.Error("Could not retrieve expected task") - } - + assert.True(t, ok, "could not retrieve expected task") } diff --git a/agent/functional_tests/README.md b/agent/functional_tests/README.md index 1dd8ca369e6..e985ef19363 100644 --- a/agent/functional_tests/README.md +++ b/agent/functional_tests/README.md @@ -14,49 +14,85 @@ It is not recommended to run these without understanding the implications of doing so and it is certainly not recommended to run them on an AWS account handling production work-loads. -## Setup +## Test setup -Before running these tests, you should build the ECS Agent and tag its image as +### Linux + +Before running these tests, you should build the ECS agent and tag its image as `amazon/amazon-ecs-agent:make`. You should also run the registry the tests pull from. This can most easily be done via `make test-registry`. +#### Environment variables +In order to run telemetry functional test in non Amazon Linux AMI environment +with older versions of the ECS agent (pre-1.10.0), the following environment +variables should be set: +* `CGROUP_PATH`: cgroup path on the host, default value "/cgroup" +* `EXECDRIVER_PATH`: execdriver path on the host, default value "/var/run/docker/execdriver" + +You can configure the following environment variables to change test +execution behavior: +* `AWS_REGION`: Control the region that is used for test execution +* `ECS_CLUSTER`: Control the cluster used for test execution +* `ECS_AGENT_IMAGE`: Override the default image of + `amazon/amazon-ecs-agent:make` +* `ECS_FTEST_TMP`: Override the default temporary directory used for storing + test logs and data files +* `ECS_FTEST_AGENT_ARGS`: Pass additional command-line arguments to the agent +* `ECS_FTEST_FORCE_NET_HOST`: Run the agent with `--net=host` + +#### Additional setup for IAM roles +In order to run TaskIamRole functional tests, the following steps should b +done first: +* Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and + `iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`. +* Set the environment variable to enable the test under default network mode: + `export TEST_TASK_IAM_ROLE=true`. +* Set the environment variable of IAM roles the test will use: + `export TASK_IAM_ROLE_ARN="iam role arn"`. The role should have the + `ec2:DescribeRegions` permission and have a trust relationship with + "ecs-tasks.amazonaws.com". If the `TASK_IAM_ROLE_ARN` environment variable + isn't set, the test will use the IAM role attached to the instance profile. + In this case, the IAM role should additionally have the + `iam:GetInstanceProfile` permission. +* Testing under net=host network mode requires additional commands: + `iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679` + and `export TEST_TASK_IAM_ROLE_NET_HOST=true` + +### Windows + +Before running these tests, you should build the ECS agent (as `agent.exe`) and +record the directory where the binary is present in the `ECS_WINDOWS_TEST_DIR` +environment variable. + +#### Environment variables +You can configure the following environment variables to change test +execution behavior: +* `AWS_REGION`: Control the region that is used for test execution +* `ECS_CLUSTER`: Control the cluster used for test execution +* `ECS_WINDOWS_TEST_DIR`: Override the path used to find `agent.exe` +* `ECS_FTEST_TMP`: Override the default temporary directory used for storing + test logs and data files + +#### Additional setup for IAM roles + +For performing the IAM roles test, perform the following additional tasks. +* `$env:TEST_TASK_IAM_ROLE="true"` +* devcon.exe should be present in system environment variable `PATH`. + + ## Running These tests should be run on an EC2 instance able to correctly run the ECS agent and access the ECS APIs. -The best way to run them is via the `make run-functional-tests` target. - -Thay may also be manually run with `go test -tags functional -v ./...`, - -### Envrionment Variable -In order to run Telemetry functional test in non Amazon Linux AMI environment, -the following environment variables should be set: - * CGROUP_PATH: cgroup path on the host, default value "/cgroup" - * EXECDRIVER_PATH: execdriver path on the host, default value "/var/run/docker/execdriver" - -In order to run TaskIamRole functional test, the following steps should be done first: - * Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and - `iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`. - * Set the environment variable to enable the test under default network mode: `export TEST_TASK_IAM_ROLE=true`. - * Set the envrionment variable of IAM roles the test will use: `export TASK_IAM_ROLE_ARN="iam role arn"`, - the role should have the `ec2:DescribeRegions` permission and have the trust relationship with "ecs-tasks.amazonaws.com". - Or if the `TASK_IAM_ROLE_ARN` isn't set, it will use the IAM role attached to the instance profile. In this case, - except the permissions required before, the IAM role should also have `iam:GetInstanceProfile` permission. - * Testing under net=host network mode requires additional command: - `iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679` and - `export TEST_TASK_IAM_ROLE_NET_HOST=true` - -## Windows Setup - -Set the following environment variable. - * $env:ECS_WINDOWS_TEST_DIR=##Path where the agent binary is present. - * For performing the IAM roles test, perform the following additional tasks. - ** `$env:TEST_TASK_IAM_ROLE="true"` - ** `Set environment variable AWS_REGION. For example, `$env:AWS_REGION="us-east-1"` - ** devcon.exe should be present in system environment variable PATH. +The best way to run them on Linux is via the `make run-functional-tests` +target. -## Running +The best way to run them on Windows is via the `run-functional-tests.ps1` +script located in the `scripts` directory. -Running the command run-functional-tests.ps1 from the scripts directory +They may also be manually run with `go test -tags functional -v ./...` and +you can use standard `go test` flags to control which tests execute (for +example, you can use the `-run` flag to provide a regular expression of tests +to include). diff --git a/agent/functional_tests/tests/functionaltests_test.go b/agent/functional_tests/tests/functionaltests_test.go index 6e7b6e45009..f7d49e275cb 100644 --- a/agent/functional_tests/tests/functionaltests_test.go +++ b/agent/functional_tests/tests/functionaltests_test.go @@ -89,6 +89,43 @@ func TestSavedState(t *testing.T) { testTask.WaitStopped(1 * time.Minute) } +// TestSavedStateWithInvalidImageAndCleanup verifies that a task definition with an invalid image does not prevent the +// agnet from starting again after the task has been cleaned up. See +// https://github.com/aws/amazon-ecs-agent/issues/1024 for details. +func TestSavedStateWithInvalidImageAndCleanup(t *testing.T) { + // Set the task cleanup time to just over a minute. + os.Setenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION", "70s") + agent := RunAgent(t, nil) + defer func() { + agent.Cleanup() + os.Unsetenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION") + }() + + testTask, err := agent.StartTask(t, "invalid-image") + require.NoError(t, err, "failed to start task") + assert.NoError(t, testTask.ExpectErrorType("error", "CannotPullContainerError", 1*time.Minute)) + + resp, err := agent.CallTaskIntrospectionAPI(testTask) + assert.NoError(t, err, "should be able to introspect the task") + assert.NotNil(t, resp, "should receive a response") + assert.Equal(t, *testTask.TaskArn, resp.Arn, "arn should be equal") + + // wait two minutes for it to be cleaned up + fmt.Println("Sleeping...") + time.Sleep(2 * time.Minute) + + resp, err = agent.CallTaskIntrospectionAPI(testTask) + assert.NoError(t, err, "should be able to call introspection api") // is there a reason we don't 404? + assert.NotNil(t, resp, "should receive a response") // why? + assert.Equal(t, "", resp.Arn, "arn is blank") + + err = agent.StopAgent() + require.NoError(t, err, "failed to stop agent") + + err = agent.StartAgent() + require.NoError(t, err, "failed to start agent again") +} + // TestPortResourceContention verifies that running two tasks on the same port // in quick-succession does not result in the second one failing to run. It // verifies the 'seqnum' serialization stuff works. diff --git a/agent/functional_tests/util/utils.go b/agent/functional_tests/util/utils.go index 249eaa56021..90867537da8 100644 --- a/agent/functional_tests/util/utils.go +++ b/agent/functional_tests/util/utils.go @@ -394,6 +394,17 @@ func (agent *TestAgent) waitRunningViaIntrospection(task *TestTask) (bool, error } } +func (agent *TestAgent) CallTaskIntrospectionAPI(task *TestTask) (*handlers.TaskResponse, error) { + rawResponse, err := agent.callTaskIntrospectionApi(*task.TaskArn) + if err != nil { + return nil, err + } + + var taskResp handlers.TaskResponse + err = json.Unmarshal(*rawResponse, &taskResp) + return &taskResp, err +} + func (agent *TestAgent) callTaskIntrospectionApi(taskArn string) (*[]byte, error) { fullIntrospectionApiURL := agent.IntrospectionURL + "/v1/tasks" if taskArn != "" { diff --git a/agent/functional_tests/util/utils_unix.go b/agent/functional_tests/util/utils_unix.go index 8baec6df5c0..44d7097f796 100644 --- a/agent/functional_tests/util/utils_unix.go +++ b/agent/functional_tests/util/utils_unix.go @@ -166,8 +166,12 @@ func (agent *TestAgent) StartAgent() error { Links: agent.Options.ContainerLinks, } + if os.Getenv("ECS_FTEST_FORCE_NET_HOST") != "" { + hostConfig.NetworkMode = "host" + } + if agent.Options != nil { - // Override the default docker envrionment variable + // Override the default docker environment variable for key, value := range agent.Options.ExtraEnvironment { envVarExists := false for i, str := range dockerConfig.Env { @@ -207,12 +211,17 @@ func (agent *TestAgent) StartAgent() error { if err != nil { return errors.New("Could not inspect agent container: " + err.Error()) } - agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort + if containerMetadata.HostConfig.NetworkMode == "host" { + agent.IntrospectionURL = "http://localhost:51678" + } else { + agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort + } + return agent.verifyIntrospectionAPI() } // getBindMounts actually constructs volume binds for container's host config -// It also additionally checks for envrionment variables: +// It also additionally checks for environment variables: // * CGROUP_PATH: the cgroup path // * EXECDRIVER_PATH: the path of metrics func (agent *TestAgent) getBindMounts() []string {