aws · samuelkarp · Oct 25, 2017 · Oct 25, 2017 · Oct 25, 2017 · Oct 25, 2017
@@ -6,6 +6,9 @@
   [#1014](https://github.com/aws/amazon-ecs-agent/pull/1014)
 * Enhancement - Support `init` process in containers by adding support for Docker remote API client version 1.25
   [#996](https://github.com/aws/amazon-ecs-agent/pull/996)
+* Bug - Fixed a bug where tasks that fail to pull containers can cause the
+  agent to fail to restore properly after a restart.
+  [#1033](https://github.com/aws/amazon-ecs-agent/pull/1033)
 
 ## 1.14.5
 * Enhancement - Retry failed container image pull operations [#975](https://github.com/aws/amazon-ecs-agent/pull/975)

@@ -318,19 +318,7 @@ func (state *DockerTaskEngineState) AddContainer(container *api.DockerContainer,
 		state.tasks[task.Arn] = task
 	}
 
-	if container.DockerID != "" {
-		// Update the container id to the state
-		state.idToContainer[container.DockerID] = container
-		state.idToTask[container.DockerID] = task.Arn
-
-		// Remove the previously added name mapping
-		delete(state.idToContainer, container.DockerName)
-		delete(state.idToTask, container.DockerName)
-	} else if container.DockerName != "" {
-		// Update the container name mapping to the state when the ID isn't available
-		state.idToContainer[container.DockerName] = container
-		state.idToTask[container.DockerName] = task.Arn
-	}
+	state.storeIDToContainerTaskUnsafe(container, task)
 
 	existingMap, exists := state.taskToID[task.Arn]
 	if !exists {
@@ -364,19 +352,54 @@ func (state *DockerTaskEngineState) RemoveTask(task *api.Task) {
 
 	task, ok := state.tasks[task.Arn]
 	if !ok {
+		seelog.Warnf("Failed to locate task %s for removal from state", task.Arn)
 		return
 	}
 	delete(state.tasks, task.Arn)
+
 	containerMap, ok := state.taskToID[task.Arn]
 	if !ok {
+		seelog.Warnf("Failed to locate containerMap for task %s for removal from state", task.Arn)
 		return
 	}
 	delete(state.taskToID, task.Arn)
 
 	for _, dockerContainer := range containerMap {
-		delete(state.idToTask, dockerContainer.DockerID)
-		delete(state.idToContainer, dockerContainer.DockerID)
+		state.removeIDToContainerTaskUnsafe(dockerContainer)
+	}
+}
+
+// storeIDToContainerTaskUnsafe stores the container in the idToContainer and idToTask maps.  The key to the maps is
+// either the Docker-generated ID or the agent-generated name (if the ID is not available).  If the container is updated
+// with an ID, a subsequent call to this function will update the map to use the ID as the key.
+func (state *DockerTaskEngineState) storeIDToContainerTaskUnsafe(container *api.DockerContainer, task *api.Task) {
+	if container.DockerID != "" {
+		// Update the container id to the state
+		state.idToContainer[container.DockerID] = container
+		state.idToTask[container.DockerID] = task.Arn
+
+		// Remove the previously added name mapping
+		delete(state.idToContainer, container.DockerName)
+		delete(state.idToTask, container.DockerName)
+	} else if container.DockerName != "" {
+		// Update the container name mapping to the state when the ID isn't available
+		state.idToContainer[container.DockerName] = container
+		state.idToTask[container.DockerName] = task.Arn
+	}
+}
+
+// removeIDToContainerTaskUnsafe removes the container from the idToContainer and idToTask maps.  They key to the maps
+// is either the Docker-generated ID or the agent-generated name (if the ID is not available).  This function assumes
+// that the ID takes precedence and will delete by the ID when the ID is available.
+func (state *DockerTaskEngineState) removeIDToContainerTaskUnsafe(container *api.DockerContainer) {
+	// The key to these maps is either the Docker ID or agent-generated name.  We use the agent-generated name
+	// before a Docker ID is available.
+	key := container.DockerID
+	if key == "" {
+		key = container.DockerName
 	}
+	delete(state.idToTask, key)
+	delete(state.idToContainer, key)
 }
 
 // RemoveImageState removes an image.ImageState

@@ -171,32 +171,42 @@ func TestTwophaseAddContainer(t *testing.T) {
 
 func TestRemoveTask(t *testing.T) {
 	state := NewTaskEngineState()
-	testContainer := &api.Container{
+	testContainer1 := &api.Container{
 		Name: "c1",
 	}
-	testDockerContainer := &api.DockerContainer{
+	testDockerContainer1 := &api.DockerContainer{
 		DockerID:  "did",
-		Container: testContainer,
+		Container: testContainer1,
+	}
+	testContainer2 := &api.Container{
+		Name: "c2",
+	}
+	testDockerContainer2 := &api.DockerContainer{
+		// DockerName is used before the DockerID is assigned
+		DockerName: "docker-name-2",
+		Container:  testContainer2,
 	}
 	testTask := &api.Task{
 		Arn:        "t1",
-		Containers: []*api.Container{testContainer},
+		Containers: []*api.Container{testContainer1, testContainer2},
 	}
 
 	state.AddTask(testTask)
-	state.AddContainer(testDockerContainer, testTask)
+	state.AddContainer(testDockerContainer1, testTask)
+	state.AddContainer(testDockerContainer2, testTask)
 
-	tasks := state.AllTasks()
-	if len(tasks) != 1 {
-		t.Error("Expected only 1 task")
-	}
+	engineState := state.(*DockerTaskEngineState)
+
+	assert.Len(t, state.AllTasks(), 1, "Expected one task")
+	assert.Len(t, engineState.idToTask, 2, "idToTask map should have two entries")
+	assert.Len(t, engineState.idToContainer, 2, "idToContainer map should have two entries")
 
 	state.RemoveTask(testTask)
 
-	tasks = state.AllTasks()
-	if len(tasks) != 0 {
-		t.Error("Expected task to be removed")
-	}
+	assert.Len(t, state.AllTasks(), 0, "Expected task to be removed")
+	assert.Len(t, engineState.idToTask, 0, "idToTask map should be empty")
+	assert.Len(t, engineState.idToContainer, 0, "idToContainer map should be empty")
+
 }
 
 func TestAddImageState(t *testing.T) {

@@ -22,6 +22,7 @@ import (
 
 	"github.com/aws/amazon-ecs-agent/agent/api"
 	"github.com/aws/amazon-ecs-agent/agent/engine/dockerstate"
+	"github.com/stretchr/testify/assert"
 )
 
 func createTestContainer(num int) *api.Container {
@@ -50,14 +51,12 @@ func createTestTask(arn string, numContainers int) *api.Task {
 
 func decodeEqual(t *testing.T, state dockerstate.TaskEngineState) dockerstate.TaskEngineState {
 	data, err := json.Marshal(&state)
-	if err != nil {
-		t.Error(err)
-	}
+	assert.NoError(t, err, "marshal state")
+
 	otherState := dockerstate.NewTaskEngineState()
 	err = json.Unmarshal(data, &otherState)
-	if err != nil {
-		t.Error(err)
-	}
+	assert.NoError(t, err, "unmarshal state")
+
 	if !DockerStatesEqual(state, otherState) {
 		debug.PrintStack()
 		t.Error("States were not equal")
@@ -77,8 +76,5 @@ func TestJsonEncoding(t *testing.T) {
 	}
 	other := decodeEqual(t, testState)
 	_, ok := other.ContainerMapByArn("test1")
-	if !ok {
-		t.Error("Could not retrieve expected task")
-	}
-
+	assert.True(t, ok, "could not retrieve expected task")
 }
@@ -14,49 +14,85 @@ It is not recommended to run these without understanding the implications of
 doing so and it is certainly not recommended to run them on an AWS account
 handling production work-loads.
 
-## Setup
+## Test setup
 
-Before running these tests, you should build the ECS Agent and tag its image as
+### Linux
+
+Before running these tests, you should build the ECS agent and tag its image as
 `amazon/amazon-ecs-agent:make`.
 
 You should also run the registry the tests pull from. This can most easily be done via `make test-registry`.
 
+#### Environment variables
+In order to run telemetry functional test in non Amazon Linux AMI environment
+with older versions of the ECS agent (pre-1.10.0), the following environment
+variables should be set:
+* `CGROUP_PATH`: cgroup path on the host, default value "/cgroup"
+* `EXECDRIVER_PATH`: execdriver path on the host, default value "/var/run/docker/execdriver"
+
+You can configure the following environment variables to change test
+execution behavior:
+* `AWS_REGION`: Control the region that is used for test execution
+* `ECS_CLUSTER`: Control the cluster used for test execution
+* `ECS_AGENT_IMAGE`: Override the default image of
+  `amazon/amazon-ecs-agent:make`
+* `ECS_FTEST_TMP`: Override the default temporary directory used for storing
+  test logs and data files
+* `ECS_FTEST_AGENT_ARGS`: Pass additional command-line arguments to the agent
+* `ECS_FTEST_FORCE_NET_HOST`: Run the agent with `--net=host`
+
+#### Additional setup for IAM roles
+In order to run TaskIamRole functional tests, the following steps should b
+done first:
+* Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and
+  `iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`.
+* Set the environment variable to enable the test under default network mode:
+  `export TEST_TASK_IAM_ROLE=true`.
+* Set the environment variable of IAM roles the test will use:
+  `export TASK_IAM_ROLE_ARN="iam role arn"`. The role should have the
+  `ec2:DescribeRegions` permission and have a trust relationship with
+  "ecs-tasks.amazonaws.com". If the `TASK_IAM_ROLE_ARN` environment variable
+  isn't set, the test will use the IAM role attached to the instance profile.
+  In this case, the IAM role should additionally have the
+  `iam:GetInstanceProfile` permission.
+* Testing under net=host network mode requires additional commands:
+  `iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679`
+  and `export TEST_TASK_IAM_ROLE_NET_HOST=true`
+
+### Windows
+
+Before running these tests, you should build the ECS agent (as `agent.exe`) and
+record the directory where the binary is present in the `ECS_WINDOWS_TEST_DIR`
+environment variable.
+
+#### Environment variables
+You can configure the following environment variables to change test
+execution behavior:
+* `AWS_REGION`: Control the region that is used for test execution
+* `ECS_CLUSTER`: Control the cluster used for test execution
+* `ECS_WINDOWS_TEST_DIR`: Override the path used to find `agent.exe`
+* `ECS_FTEST_TMP`: Override the default temporary directory used for storing
+  test logs and data files
+
+#### Additional setup for IAM roles
+
+For performing the IAM roles test, perform the following additional tasks.
+* `$env:TEST_TASK_IAM_ROLE="true"`
+* devcon.exe should be present in system environment variable `PATH`.
+
+
 ## Running
 
 These tests should be run on an EC2 instance able to correctly run the ECS
 agent and access the ECS APIs.
 
-The best way to run them is via the `make run-functional-tests` target.
-
-Thay may also be manually run with `go test -tags functional -v ./...`,
-
-### Envrionment Variable
-In order to run Telemetry functional test in non Amazon Linux AMI environment,
-the following environment variables should be set:
-  * CGROUP_PATH: cgroup path on the host, default value "/cgroup"
-  * EXECDRIVER_PATH: execdriver path on the host, default value "/var/run/docker/execdriver"
-
-In order to run TaskIamRole functional test, the following steps should be done first:
-  * Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and
-    `iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`.
-  * Set the environment variable to enable the test under default network mode: `export TEST_TASK_IAM_ROLE=true`.
-  * Set the envrionment variable of IAM roles the test will use: `export TASK_IAM_ROLE_ARN="iam role arn"`,
-  the role should have the `ec2:DescribeRegions` permission and have the trust relationship with "ecs-tasks.amazonaws.com".
-  Or if the `TASK_IAM_ROLE_ARN` isn't set, it will use the IAM role attached to the instance profile. In this case,
-  except the permissions required before, the IAM role should also have `iam:GetInstanceProfile` permission.
-  * Testing under net=host network mode requires additional command:
-    `iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679` and
-    `export TEST_TASK_IAM_ROLE_NET_HOST=true`
-
-## Windows Setup
-
-Set the following environment variable.
-  * $env:ECS_WINDOWS_TEST_DIR=##Path where the agent binary is present.
-  * For performing the IAM roles test, perform the following additional tasks.
-    ** `$env:TEST_TASK_IAM_ROLE="true"`
-    ** `Set environment variable AWS_REGION. For example, `$env:AWS_REGION="us-east-1"`
-    ** devcon.exe should be present in system environment variable PATH.    
+The best way to run them on Linux is via the `make run-functional-tests`
+target.
 
-## Running
+The best way to run them on Windows is via the `run-functional-tests.ps1`
+script located in the `scripts` directory.
 
-Running the command run-functional-tests.ps1 from the scripts directory
+They may also be manually run with `go test -tags functional -v ./...` and
+you can use standard `go test` flags to control which tests execute (for
+example, you can use the `-run` flag to provide a regular expression of tests
+to include).
@@ -89,6 +89,43 @@ func TestSavedState(t *testing.T) {
 	testTask.WaitStopped(1 * time.Minute)
 }
 
+// TestSavedStateWithInvalidImageAndCleanup verifies that a task definition with an invalid image does not prevent the
+// agnet from starting again after the task has been cleaned up.  See
+// https://github.com/aws/amazon-ecs-agent/issues/1024 for details.
+func TestSavedStateWithInvalidImageAndCleanup(t *testing.T) {
+	// Set the task cleanup time to just over a minute.
+	os.Setenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION", "70s")
+	agent := RunAgent(t, nil)
+	defer func() {
+		agent.Cleanup()
+		os.Unsetenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION")
+	}()
+
+	testTask, err := agent.StartTask(t, "invalid-image")
+	require.NoError(t, err, "failed to start task")
+	assert.NoError(t, testTask.ExpectErrorType("error", "CannotPullContainerError", 1*time.Minute))
+
+	resp, err := agent.CallTaskIntrospectionAPI(testTask)
+	assert.NoError(t, err, "should be able to introspect the task")
+	assert.NotNil(t, resp, "should receive a response")
+	assert.Equal(t, *testTask.TaskArn, resp.Arn, "arn should be equal")
+
+	// wait two minutes for it to be cleaned up
+	fmt.Println("Sleeping...")
+	time.Sleep(2 * time.Minute)
+
+	resp, err = agent.CallTaskIntrospectionAPI(testTask)
+	assert.NoError(t, err, "should be able to call introspection api") // is there a reason we don't 404?
+	assert.NotNil(t, resp, "should receive a response")                // why?
+	assert.Equal(t, "", resp.Arn, "arn is blank")
+
+	err = agent.StopAgent()
+	require.NoError(t, err, "failed to stop agent")
+
+	err = agent.StartAgent()
+	require.NoError(t, err, "failed to start agent again")
+}
+
 // TestPortResourceContention verifies that running two tasks on the same port
 // in quick-succession does not result in the second one failing to run. It
 // verifies the 'seqnum' serialization stuff works.

@@ -394,6 +394,17 @@ func (agent *TestAgent) waitRunningViaIntrospection(task *TestTask) (bool, error
 	}
 }
 
+func (agent *TestAgent) CallTaskIntrospectionAPI(task *TestTask) (*handlers.TaskResponse, error) {
+	rawResponse, err := agent.callTaskIntrospectionApi(*task.TaskArn)
+	if err != nil {
+		return nil, err
+	}
+
+	var taskResp handlers.TaskResponse
+	err = json.Unmarshal(*rawResponse, &taskResp)
+	return &taskResp, err
+}
+
 func (agent *TestAgent) callTaskIntrospectionApi(taskArn string) (*[]byte, error) {
 	fullIntrospectionApiURL := agent.IntrospectionURL + "/v1/tasks"
 	if taskArn != "" {

@@ -166,8 +166,12 @@ func (agent *TestAgent) StartAgent() error {
 		Links: agent.Options.ContainerLinks,
 	}
 
+	if os.Getenv("ECS_FTEST_FORCE_NET_HOST") != "" {
+		hostConfig.NetworkMode = "host"
+	}
+
 	if agent.Options != nil {
-		// Override the default docker envrionment variable
+		// Override the default docker environment variable
 		for key, value := range agent.Options.ExtraEnvironment {
 			envVarExists := false
 			for i, str := range dockerConfig.Env {
@@ -207,12 +211,17 @@ func (agent *TestAgent) StartAgent() error {
 	if err != nil {
 		return errors.New("Could not inspect agent container: " + err.Error())
 	}
-	agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort
+	if containerMetadata.HostConfig.NetworkMode == "host" {
+		agent.IntrospectionURL = "http://localhost:51678"
+	} else {
+		agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort
+	}
+
 	return agent.verifyIntrospectionAPI()
 }
 
 // getBindMounts actually constructs volume binds for container's host config
-// It also additionally checks for envrionment variables:
+// It also additionally checks for environment variables:
 // * CGROUP_PATH: the cgroup path
 // * EXECDRIVER_PATH: the path of metrics
 func (agent *TestAgent) getBindMounts() []string {