Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dockerstate: map keys differ before create #1033

Merged
merged 5 commits into from
Oct 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
[#1014](https://github.com/aws/amazon-ecs-agent/pull/1014)
* Enhancement - Support `init` process in containers by adding support for Docker remote API client version 1.25
[#996](https://github.com/aws/amazon-ecs-agent/pull/996)
* Bug - Fixed a bug where tasks that fail to pull containers can cause the
agent to fail to restore properly after a restart.
[#1033](https://github.com/aws/amazon-ecs-agent/pull/1033)

## 1.14.5
* Enhancement - Retry failed container image pull operations [#975](https://github.com/aws/amazon-ecs-agent/pull/975)
Expand Down
53 changes: 38 additions & 15 deletions agent/engine/dockerstate/docker_task_engine_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,19 +318,7 @@ func (state *DockerTaskEngineState) AddContainer(container *api.DockerContainer,
state.tasks[task.Arn] = task
}

if container.DockerID != "" {
// Update the container id to the state
state.idToContainer[container.DockerID] = container
state.idToTask[container.DockerID] = task.Arn

// Remove the previously added name mapping
delete(state.idToContainer, container.DockerName)
delete(state.idToTask, container.DockerName)
} else if container.DockerName != "" {
// Update the container name mapping to the state when the ID isn't available
state.idToContainer[container.DockerName] = container
state.idToTask[container.DockerName] = task.Arn
}
state.storeIDToContainerTaskUnsafe(container, task)

existingMap, exists := state.taskToID[task.Arn]
if !exists {
Expand Down Expand Up @@ -364,19 +352,54 @@ func (state *DockerTaskEngineState) RemoveTask(task *api.Task) {

task, ok := state.tasks[task.Arn]
if !ok {
seelog.Warnf("Failed to locate task %s for removal from state", task.Arn)
return
}
delete(state.tasks, task.Arn)

containerMap, ok := state.taskToID[task.Arn]
if !ok {
seelog.Warnf("Failed to locate containerMap for task %s for removal from state", task.Arn)
return
}
delete(state.taskToID, task.Arn)

for _, dockerContainer := range containerMap {
delete(state.idToTask, dockerContainer.DockerID)
delete(state.idToContainer, dockerContainer.DockerID)
state.removeIDToContainerTaskUnsafe(dockerContainer)
}
}

// storeIDToContainerTaskUnsafe stores the container in the idToContainer and idToTask maps. The key to the maps is
// either the Docker-generated ID or the agent-generated name (if the ID is not available). If the container is updated
// with an ID, a subsequent call to this function will update the map to use the ID as the key.
func (state *DockerTaskEngineState) storeIDToContainerTaskUnsafe(container *api.DockerContainer, task *api.Task) {
if container.DockerID != "" {
// Update the container id to the state
state.idToContainer[container.DockerID] = container
state.idToTask[container.DockerID] = task.Arn

// Remove the previously added name mapping
delete(state.idToContainer, container.DockerName)
delete(state.idToTask, container.DockerName)
} else if container.DockerName != "" {
// Update the container name mapping to the state when the ID isn't available
state.idToContainer[container.DockerName] = container
state.idToTask[container.DockerName] = task.Arn
}
}

// removeIDToContainerTaskUnsafe removes the container from the idToContainer and idToTask maps. They key to the maps
// is either the Docker-generated ID or the agent-generated name (if the ID is not available). This function assumes
// that the ID takes precedence and will delete by the ID when the ID is available.
func (state *DockerTaskEngineState) removeIDToContainerTaskUnsafe(container *api.DockerContainer) {
// The key to these maps is either the Docker ID or agent-generated name. We use the agent-generated name
// before a Docker ID is available.
key := container.DockerID
if key == "" {
key = container.DockerName
}
delete(state.idToTask, key)
delete(state.idToContainer, key)
}

// RemoveImageState removes an image.ImageState
Expand Down
36 changes: 23 additions & 13 deletions agent/engine/dockerstate/dockerstate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,32 +171,42 @@ func TestTwophaseAddContainer(t *testing.T) {

func TestRemoveTask(t *testing.T) {
state := NewTaskEngineState()
testContainer := &api.Container{
testContainer1 := &api.Container{
Name: "c1",
}
testDockerContainer := &api.DockerContainer{
testDockerContainer1 := &api.DockerContainer{
DockerID: "did",
Container: testContainer,
Container: testContainer1,
}
testContainer2 := &api.Container{
Name: "c2",
}
testDockerContainer2 := &api.DockerContainer{
// DockerName is used before the DockerID is assigned
DockerName: "docker-name-2",
Container: testContainer2,
}
testTask := &api.Task{
Arn: "t1",
Containers: []*api.Container{testContainer},
Containers: []*api.Container{testContainer1, testContainer2},
}

state.AddTask(testTask)
state.AddContainer(testDockerContainer, testTask)
state.AddContainer(testDockerContainer1, testTask)
state.AddContainer(testDockerContainer2, testTask)

tasks := state.AllTasks()
if len(tasks) != 1 {
t.Error("Expected only 1 task")
}
engineState := state.(*DockerTaskEngineState)

assert.Len(t, state.AllTasks(), 1, "Expected one task")
assert.Len(t, engineState.idToTask, 2, "idToTask map should have two entries")
assert.Len(t, engineState.idToContainer, 2, "idToContainer map should have two entries")

state.RemoveTask(testTask)

tasks = state.AllTasks()
if len(tasks) != 0 {
t.Error("Expected task to be removed")
}
assert.Len(t, state.AllTasks(), 0, "Expected task to be removed")
assert.Len(t, engineState.idToTask, 0, "idToTask map should be empty")
assert.Len(t, engineState.idToContainer, 0, "idToContainer map should be empty")

}

func TestAddImageState(t *testing.T) {
Expand Down
16 changes: 6 additions & 10 deletions agent/engine/dockerstate/testutils/json_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/aws/amazon-ecs-agent/agent/api"
"github.com/aws/amazon-ecs-agent/agent/engine/dockerstate"
"github.com/stretchr/testify/assert"
)

func createTestContainer(num int) *api.Container {
Expand Down Expand Up @@ -50,14 +51,12 @@ func createTestTask(arn string, numContainers int) *api.Task {

func decodeEqual(t *testing.T, state dockerstate.TaskEngineState) dockerstate.TaskEngineState {
data, err := json.Marshal(&state)
if err != nil {
t.Error(err)
}
assert.NoError(t, err, "marshal state")

otherState := dockerstate.NewTaskEngineState()
err = json.Unmarshal(data, &otherState)
if err != nil {
t.Error(err)
}
assert.NoError(t, err, "unmarshal state")

if !DockerStatesEqual(state, otherState) {
debug.PrintStack()
t.Error("States were not equal")
Expand All @@ -77,8 +76,5 @@ func TestJsonEncoding(t *testing.T) {
}
other := decodeEqual(t, testState)
_, ok := other.ContainerMapByArn("test1")
if !ok {
t.Error("Could not retrieve expected task")
}

assert.True(t, ok, "could not retrieve expected task")
}
104 changes: 70 additions & 34 deletions agent/functional_tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,49 +14,85 @@ It is not recommended to run these without understanding the implications of
doing so and it is certainly not recommended to run them on an AWS account
handling production work-loads.

## Setup
## Test setup

Before running these tests, you should build the ECS Agent and tag its image as
### Linux

Before running these tests, you should build the ECS agent and tag its image as
`amazon/amazon-ecs-agent:make`.

You should also run the registry the tests pull from. This can most easily be done via `make test-registry`.

#### Environment variables
In order to run telemetry functional test in non Amazon Linux AMI environment
with older versions of the ECS agent (pre-1.10.0), the following environment
variables should be set:
* `CGROUP_PATH`: cgroup path on the host, default value "/cgroup"
* `EXECDRIVER_PATH`: execdriver path on the host, default value "/var/run/docker/execdriver"

You can configure the following environment variables to change test
execution behavior:
* `AWS_REGION`: Control the region that is used for test execution
* `ECS_CLUSTER`: Control the cluster used for test execution
* `ECS_AGENT_IMAGE`: Override the default image of
`amazon/amazon-ecs-agent:make`
* `ECS_FTEST_TMP`: Override the default temporary directory used for storing
test logs and data files
* `ECS_FTEST_AGENT_ARGS`: Pass additional command-line arguments to the agent
* `ECS_FTEST_FORCE_NET_HOST`: Run the agent with `--net=host`

#### Additional setup for IAM roles
In order to run TaskIamRole functional tests, the following steps should b
done first:
* Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and
`iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`.
* Set the environment variable to enable the test under default network mode:
`export TEST_TASK_IAM_ROLE=true`.
* Set the environment variable of IAM roles the test will use:
`export TASK_IAM_ROLE_ARN="iam role arn"`. The role should have the
`ec2:DescribeRegions` permission and have a trust relationship with
"ecs-tasks.amazonaws.com". If the `TASK_IAM_ROLE_ARN` environment variable
isn't set, the test will use the IAM role attached to the instance profile.
In this case, the IAM role should additionally have the
`iam:GetInstanceProfile` permission.
* Testing under net=host network mode requires additional commands:
`iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679`
and `export TEST_TASK_IAM_ROLE_NET_HOST=true`

### Windows

Before running these tests, you should build the ECS agent (as `agent.exe`) and
record the directory where the binary is present in the `ECS_WINDOWS_TEST_DIR`
environment variable.

#### Environment variables
You can configure the following environment variables to change test
execution behavior:
* `AWS_REGION`: Control the region that is used for test execution
* `ECS_CLUSTER`: Control the cluster used for test execution
* `ECS_WINDOWS_TEST_DIR`: Override the path used to find `agent.exe`
* `ECS_FTEST_TMP`: Override the default temporary directory used for storing
test logs and data files

#### Additional setup for IAM roles

For performing the IAM roles test, perform the following additional tasks.
* `$env:TEST_TASK_IAM_ROLE="true"`
* devcon.exe should be present in system environment variable `PATH`.


## Running

These tests should be run on an EC2 instance able to correctly run the ECS
agent and access the ECS APIs.

The best way to run them is via the `make run-functional-tests` target.

Thay may also be manually run with `go test -tags functional -v ./...`,

### Envrionment Variable
In order to run Telemetry functional test in non Amazon Linux AMI environment,
the following environment variables should be set:
* CGROUP_PATH: cgroup path on the host, default value "/cgroup"
* EXECDRIVER_PATH: execdriver path on the host, default value "/var/run/docker/execdriver"

In order to run TaskIamRole functional test, the following steps should be done first:
* Run command: `sysctl -w net.ipv4.conf.all.route_localnet=1` and
`iptables -t nat -A PREROUTING -p tcp -d 169.254.170.2 --dport 80 -j DNAT --to-destination 127.0.0.1:51679`.
* Set the environment variable to enable the test under default network mode: `export TEST_TASK_IAM_ROLE=true`.
* Set the envrionment variable of IAM roles the test will use: `export TASK_IAM_ROLE_ARN="iam role arn"`,
the role should have the `ec2:DescribeRegions` permission and have the trust relationship with "ecs-tasks.amazonaws.com".
Or if the `TASK_IAM_ROLE_ARN` isn't set, it will use the IAM role attached to the instance profile. In this case,
except the permissions required before, the IAM role should also have `iam:GetInstanceProfile` permission.
* Testing under net=host network mode requires additional command:
`iptables -t nat -A OUTPUT -d 169.254.170.2 -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 51679` and
`export TEST_TASK_IAM_ROLE_NET_HOST=true`

## Windows Setup

Set the following environment variable.
* $env:ECS_WINDOWS_TEST_DIR=##Path where the agent binary is present.
* For performing the IAM roles test, perform the following additional tasks.
** `$env:TEST_TASK_IAM_ROLE="true"`
** `Set environment variable AWS_REGION. For example, `$env:AWS_REGION="us-east-1"`
** devcon.exe should be present in system environment variable PATH.
The best way to run them on Linux is via the `make run-functional-tests`
target.

## Running
The best way to run them on Windows is via the `run-functional-tests.ps1`
script located in the `scripts` directory.

Running the command run-functional-tests.ps1 from the scripts directory
They may also be manually run with `go test -tags functional -v ./...` and
you can use standard `go test` flags to control which tests execute (for
example, you can use the `-run` flag to provide a regular expression of tests
to include).
37 changes: 37 additions & 0 deletions agent/functional_tests/tests/functionaltests_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,43 @@ func TestSavedState(t *testing.T) {
testTask.WaitStopped(1 * time.Minute)
}

// TestSavedStateWithInvalidImageAndCleanup verifies that a task definition with an invalid image does not prevent the
// agnet from starting again after the task has been cleaned up. See
// https://github.com/aws/amazon-ecs-agent/issues/1024 for details.
func TestSavedStateWithInvalidImageAndCleanup(t *testing.T) {
// Set the task cleanup time to just over a minute.
os.Setenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION", "70s")
agent := RunAgent(t, nil)
defer func() {
agent.Cleanup()
os.Unsetenv("ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION")
}()

testTask, err := agent.StartTask(t, "invalid-image")
require.NoError(t, err, "failed to start task")
assert.NoError(t, testTask.ExpectErrorType("error", "CannotPullContainerError", 1*time.Minute))

resp, err := agent.CallTaskIntrospectionAPI(testTask)
assert.NoError(t, err, "should be able to introspect the task")
assert.NotNil(t, resp, "should receive a response")
assert.Equal(t, *testTask.TaskArn, resp.Arn, "arn should be equal")

// wait two minutes for it to be cleaned up
fmt.Println("Sleeping...")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: remove this? Or use a logger?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fmt.Println will print out immediately (when running tests for a single package) while t.Log buffers until the test ends. I had wanted to print this out during the run so I could inspect the agent logs while it was sleeping. I'm inclined to leave this in right now, but I can remove it if you feel strongly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nah, you can leave this be. It's functional tests and it doesn't matter if log out to stdout here immediately.

time.Sleep(2 * time.Minute)

resp, err = agent.CallTaskIntrospectionAPI(testTask)
assert.NoError(t, err, "should be able to call introspection api") // is there a reason we don't 404?
assert.NotNil(t, resp, "should receive a response") // why?
assert.Equal(t, "", resp.Arn, "arn is blank")

err = agent.StopAgent()
require.NoError(t, err, "failed to stop agent")

err = agent.StartAgent()
require.NoError(t, err, "failed to start agent again")
}

// TestPortResourceContention verifies that running two tasks on the same port
// in quick-succession does not result in the second one failing to run. It
// verifies the 'seqnum' serialization stuff works.
Expand Down
11 changes: 11 additions & 0 deletions agent/functional_tests/util/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,17 @@ func (agent *TestAgent) waitRunningViaIntrospection(task *TestTask) (bool, error
}
}

func (agent *TestAgent) CallTaskIntrospectionAPI(task *TestTask) (*handlers.TaskResponse, error) {
rawResponse, err := agent.callTaskIntrospectionApi(*task.TaskArn)
if err != nil {
return nil, err
}

var taskResp handlers.TaskResponse
err = json.Unmarshal(*rawResponse, &taskResp)
return &taskResp, err
}

func (agent *TestAgent) callTaskIntrospectionApi(taskArn string) (*[]byte, error) {
fullIntrospectionApiURL := agent.IntrospectionURL + "/v1/tasks"
if taskArn != "" {
Expand Down
15 changes: 12 additions & 3 deletions agent/functional_tests/util/utils_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,12 @@ func (agent *TestAgent) StartAgent() error {
Links: agent.Options.ContainerLinks,
}

if os.Getenv("ECS_FTEST_FORCE_NET_HOST") != "" {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is this environment variable being set?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not automatically set. I added this because I was developing the test in an environment where the docker0 bridge did not have Internet access (for unrelated reasons) and wanted the agent to actually be able to run.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then can you add this into the agent/functional_tests/README.d.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I rewrote a bit more of the README to make it more clear.

hostConfig.NetworkMode = "host"
}

if agent.Options != nil {
// Override the default docker envrionment variable
// Override the default docker environment variable
for key, value := range agent.Options.ExtraEnvironment {
envVarExists := false
for i, str := range dockerConfig.Env {
Expand Down Expand Up @@ -207,12 +211,17 @@ func (agent *TestAgent) StartAgent() error {
if err != nil {
return errors.New("Could not inspect agent container: " + err.Error())
}
agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort
if containerMetadata.HostConfig.NetworkMode == "host" {
agent.IntrospectionURL = "http://localhost:51678"
} else {
agent.IntrospectionURL = "http://localhost:" + containerMetadata.NetworkSettings.Ports["51678/tcp"][0].HostPort
}

return agent.verifyIntrospectionAPI()
}

// getBindMounts actually constructs volume binds for container's host config
// It also additionally checks for envrionment variables:
// It also additionally checks for environment variables:
// * CGROUP_PATH: the cgroup path
// * EXECDRIVER_PATH: the path of metrics
func (agent *TestAgent) getBindMounts() []string {
Expand Down