Skip to content

Commit e6b9f2a

Browse files
committed
Fix bug that could incorrectly clean up pause contaier before other containers
1 parent 76bcf85 commit e6b9f2a

File tree

3 files changed

+131
-33
lines changed

3 files changed

+131
-33
lines changed

agent/engine/docker_task_engine.go

+21-6
Original file line numberDiff line numberDiff line change
@@ -1425,15 +1425,12 @@ func (engine *DockerTaskEngine) checkTearDownPauseContainer(task *apitask.Task)
14251425
}
14261426
for _, container := range task.Containers {
14271427
// Cleanup the pause container network namespace before stop the container
1428-
if container.Type == apicontainer.ContainerCNIPause && !container.IsContainerTornDown() {
1428+
if container.Type == apicontainer.ContainerCNIPause {
14291429
// Clean up if the pause container has stopped or will stop
14301430
if container.KnownTerminal() || container.DesiredTerminal() {
14311431
err := engine.cleanupPauseContainerNetwork(task, container)
14321432
if err != nil {
14331433
seelog.Errorf("Task engine [%s]: unable to cleanup pause container network namespace: %v", task.Arn, err)
1434-
} else {
1435-
container.SetContainerTornDown(true)
1436-
seelog.Infof("Task engine [%s]: cleaned pause container network namespace", task.Arn)
14371434
}
14381435
}
14391436
return
@@ -1443,6 +1440,10 @@ func (engine *DockerTaskEngine) checkTearDownPauseContainer(task *apitask.Task)
14431440

14441441
// cleanupPauseContainerNetwork will clean up the network namespace of pause container
14451442
func (engine *DockerTaskEngine) cleanupPauseContainerNetwork(task *apitask.Task, container *apicontainer.Container) error {
1443+
// This operation is idempotent
1444+
if container.IsContainerTornDown() {
1445+
return nil
1446+
}
14461447
delay := time.Duration(engine.cfg.ENIPauseContainerCleanupDelaySeconds) * time.Second
14471448
if engine.handleDelay != nil && delay > 0 {
14481449
seelog.Infof("Task engine [%s]: waiting %s before cleaning up pause container.", task.Arn, delay)
@@ -1460,7 +1461,14 @@ func (engine *DockerTaskEngine) cleanupPauseContainerNetwork(task *apitask.Task,
14601461
"engine: failed cleanup task network namespace, task: %s", task.String())
14611462
}
14621463

1463-
return engine.cniClient.CleanupNS(engine.ctx, cniConfig, cniCleanupTimeout)
1464+
err = engine.cniClient.CleanupNS(engine.ctx, cniConfig, cniCleanupTimeout)
1465+
if err != nil {
1466+
return err
1467+
}
1468+
1469+
container.SetContainerTornDown(true)
1470+
seelog.Infof("Task engine [%s]: cleaned pause container network namespace", task.Arn)
1471+
return nil
14641472
}
14651473

14661474
// buildCNIConfigFromTaskContainer builds a CNI config for the task and container.
@@ -1513,7 +1521,14 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api
15131521
}
15141522
}
15151523

1516-
engine.checkTearDownPauseContainer(task)
1524+
// Cleanup the pause container network namespace before stop the container
1525+
if container.Type == apicontainer.ContainerCNIPause {
1526+
err := engine.cleanupPauseContainerNetwork(task, container)
1527+
if err != nil {
1528+
seelog.Errorf("Task engine [%s]: unable to cleanup pause container network namespace: %v",
1529+
task.Arn, err)
1530+
}
1531+
}
15171532

15181533
apiTimeoutStopContainer := container.GetStopTimeout()
15191534
if apiTimeoutStopContainer <= 0 {

agent/engine/docker_task_engine_test.go

+47-27
Original file line numberDiff line numberDiff line change
@@ -1137,9 +1137,11 @@ func TestPauseContainerHappyPath(t *testing.T) {
11371137
taskEngine.(*DockerTaskEngine).cniClient = cniClient
11381138
taskEngine.(*DockerTaskEngine).taskSteadyStatePollInterval = taskSteadyStatePollInterval
11391139
eventStream := make(chan dockerapi.DockerContainerChangeEvent)
1140-
sleepTask := testdata.LoadTask("sleep5")
1141-
sleepContainer := sleepTask.Containers[0]
1142-
sleepContainer.TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
1140+
sleepTask := testdata.LoadTask("sleep5TwoContainers")
1141+
sleepContainer1 := sleepTask.Containers[0]
1142+
sleepContainer1.TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
1143+
sleepContainer2 := sleepTask.Containers[1]
1144+
sleepContainer2.TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
11431145

11441146
// Add eni information to the task so the task can add dependency of pause container
11451147
sleepTask.AddTaskENI(mockENI)
@@ -1158,6 +1160,8 @@ func TestPauseContainerHappyPath(t *testing.T) {
11581160

11591161
dockerClient.EXPECT().ContainerEvents(gomock.Any()).Return(eventStream, nil)
11601162

1163+
sleepContainerID1 := containerID + "1"
1164+
sleepContainerID2 := containerID + "2"
11611165
pauseContainerID := "pauseContainerID"
11621166
// Pause container will be launched first
11631167
gomock.InOrder(
@@ -1183,19 +1187,26 @@ func TestPauseContainerHappyPath(t *testing.T) {
11831187

11841188
// For the other container
11851189
imageManager.EXPECT().AddAllImageStates(gomock.Any()).AnyTimes()
1186-
dockerClient.EXPECT().PullImage(gomock.Any(), gomock.Any(), nil, gomock.Any()).Return(dockerapi.DockerContainerMetadata{})
1187-
imageManager.EXPECT().RecordContainerReference(gomock.Any()).Return(nil)
1188-
imageManager.EXPECT().GetImageStateFromImageName(gomock.Any()).Return(nil, false)
1189-
dockerClient.EXPECT().APIVersion().Return(defaultDockerClientAPIVersion, nil)
1190+
dockerClient.EXPECT().PullImage(gomock.Any(), gomock.Any(), nil, gomock.Any()).Return(dockerapi.DockerContainerMetadata{}).Times(2)
1191+
imageManager.EXPECT().RecordContainerReference(gomock.Any()).Return(nil).Times(2)
1192+
imageManager.EXPECT().GetImageStateFromImageName(gomock.Any()).Return(nil, false).Times(2)
1193+
dockerClient.EXPECT().APIVersion().Return(defaultDockerClientAPIVersion, nil).Times(2)
1194+
1195+
dockerClient.EXPECT().CreateContainer(gomock.Any(), gomock.Any(), gomock.Any(),
1196+
gomock.Any(), gomock.Any()).Return(dockerapi.DockerContainerMetadata{DockerID: sleepContainerID1})
11901197
dockerClient.EXPECT().CreateContainer(gomock.Any(), gomock.Any(), gomock.Any(),
1191-
gomock.Any(), gomock.Any()).Return(dockerapi.DockerContainerMetadata{DockerID: containerID})
1192-
dockerClient.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return(
1193-
dockerapi.DockerContainerMetadata{DockerID: containerID})
1198+
gomock.Any(), gomock.Any()).Return(dockerapi.DockerContainerMetadata{DockerID: sleepContainerID2})
1199+
1200+
dockerClient.EXPECT().StartContainer(gomock.Any(), sleepContainerID1, defaultConfig.ContainerStartTimeout).Return(
1201+
dockerapi.DockerContainerMetadata{DockerID: sleepContainerID1})
1202+
dockerClient.EXPECT().StartContainer(gomock.Any(), sleepContainerID2, defaultConfig.ContainerStartTimeout).Return(
1203+
dockerapi.DockerContainerMetadata{DockerID: sleepContainerID2})
11941204

11951205
cleanup := make(chan time.Time)
11961206
defer close(cleanup)
11971207
mockTime.EXPECT().Now().Return(time.Now()).MinTimes(1)
1198-
dockerClient.EXPECT().DescribeContainer(gomock.Any(), containerID).AnyTimes()
1208+
dockerClient.EXPECT().DescribeContainer(gomock.Any(), sleepContainerID1).AnyTimes()
1209+
dockerClient.EXPECT().DescribeContainer(gomock.Any(), sleepContainerID2).AnyTimes()
11991210
dockerClient.EXPECT().DescribeContainer(gomock.Any(), pauseContainerID).AnyTimes()
12001211

12011212
err := taskEngine.Init(ctx)
@@ -1208,27 +1219,36 @@ func TestPauseContainerHappyPath(t *testing.T) {
12081219
var wg sync.WaitGroup
12091220
wg.Add(1)
12101221
mockTime.EXPECT().After(gomock.Any()).Return(cleanup).MinTimes(1)
1211-
dockerClient.EXPECT().InspectContainer(gomock.Any(), gomock.Any(), gomock.Any()).Return(&types.ContainerJSON{
1212-
ContainerJSONBase: &types.ContainerJSONBase{
1213-
ID: pauseContainerID,
1214-
State: &types.ContainerState{Pid: containerPid},
1215-
},
1216-
}, nil)
1217-
cniClient.EXPECT().CleanupNS(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
1218-
dockerClient.EXPECT().StopContainer(gomock.Any(), pauseContainerID, gomock.Any()).Return(
1219-
dockerapi.DockerContainerMetadata{DockerID: pauseContainerID})
1220-
cniClient.EXPECT().ReleaseIPResource(gomock.Any(), gomock.Any(), gomock.Any()).Do(
1221-
func(ctx context.Context, cfg *ecscni.Config, timeout time.Duration) {
1222-
wg.Done()
1223-
}).Return(nil)
1224-
dockerClient.EXPECT().RemoveContainer(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).Times(2)
1225-
imageManager.EXPECT().RemoveContainerReferenceFromImageState(gomock.Any()).Return(nil)
1222+
1223+
gomock.InOrder(
1224+
dockerClient.EXPECT().StopContainer(gomock.Any(), sleepContainerID2, gomock.Any()).Return(
1225+
dockerapi.DockerContainerMetadata{DockerID: sleepContainerID2}),
1226+
1227+
dockerClient.EXPECT().InspectContainer(gomock.Any(), pauseContainerID, gomock.Any()).Return(&types.ContainerJSON{
1228+
ContainerJSONBase: &types.ContainerJSONBase{
1229+
ID: pauseContainerID,
1230+
State: &types.ContainerState{Pid: containerPid},
1231+
},
1232+
}, nil),
1233+
cniClient.EXPECT().CleanupNS(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil),
1234+
1235+
dockerClient.EXPECT().StopContainer(gomock.Any(), pauseContainerID, gomock.Any()).Return(
1236+
dockerapi.DockerContainerMetadata{DockerID: pauseContainerID}),
1237+
1238+
cniClient.EXPECT().ReleaseIPResource(gomock.Any(), gomock.Any(), gomock.Any()).Do(
1239+
func(ctx context.Context, cfg *ecscni.Config, timeout time.Duration) {
1240+
wg.Done()
1241+
}).Return(nil),
1242+
)
1243+
1244+
dockerClient.EXPECT().RemoveContainer(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).Times(3)
1245+
imageManager.EXPECT().RemoveContainerReferenceFromImageState(gomock.Any()).Return(nil).Times(2)
12261246

12271247
// Simulate a container stop event from docker
12281248
eventStream <- dockerapi.DockerContainerChangeEvent{
12291249
Status: apicontainerstatus.ContainerStopped,
12301250
DockerContainerMetadata: dockerapi.DockerContainerMetadata{
1231-
DockerID: containerID,
1251+
DockerID: sleepContainerID1,
12321252
ExitCode: aws.Int(exitCode),
12331253
},
12341254
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
{
2+
"Arn":"arn:aws:ecs:us-west-2:123456789012:task/12345678-90ab-cdef-1234-56780abcdef1",
3+
"Family":"sleep5",
4+
"Version":"2",
5+
"Containers":
6+
[
7+
{
8+
"Name":"sleep5",
9+
"Image":"busybox",
10+
"Command":["sleep","5"],
11+
"Cpu":10,
12+
"Memory":10,
13+
"Links":null,
14+
"volumesFrom":[],
15+
"mountPoints":[],
16+
"portMappings":[],
17+
"Essential":true,
18+
"EntryPoint":null,
19+
"environment":{},
20+
"overrides":{"command":null},
21+
"desiredStatus":"NONE",
22+
"KnownStatus":"NONE",
23+
"RunDependencies":null,
24+
"IsInternal":false,
25+
"AppliedStatus":"NONE",
26+
"ApplyingError":null,
27+
"SentStatus":"NONE",
28+
"KnownExitCode":null,
29+
"KnownPortBindingsUnsafe":null,
30+
"StatusLock":{}
31+
},
32+
{
33+
"Name":"sleep5-2",
34+
"Image":"busybox",
35+
"Command":["sleep","5"],
36+
"Cpu":10,
37+
"Memory":10,
38+
"Links":null,
39+
"volumesFrom":[],
40+
"mountPoints":[],
41+
"portMappings":[],
42+
"Essential":true,
43+
"EntryPoint":null,
44+
"environment":{},
45+
"overrides":{"command":null},
46+
"desiredStatus":"NONE",
47+
"KnownStatus":"NONE",
48+
"RunDependencies":null,
49+
"IsInternal":false,
50+
"AppliedStatus":"NONE",
51+
"ApplyingError":null,
52+
"SentStatus":"NONE",
53+
"KnownExitCode":null,
54+
"KnownPortBindingsUnsafe":null,
55+
"StatusLock":{}
56+
}
57+
],
58+
"volumes":[],
59+
"DesiredStatus":"RUNNING",
60+
"KnownStatus":"NONE",
61+
"KnownTime":"0001-01-01T00:00:00Z",
62+
"SentStatus":"NONE"
63+
}

0 commit comments

Comments
 (0)