Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docker stop timeout buffer increased from 30s to 2m #2697

Merged
merged 2 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ const (
)

// Timelimits for docker operations enforced above docker
// TODO: Make these limits configurable.
const (
// Parameters for caching the docker auth for ECR
tokenCacheSize = 100
Expand All @@ -90,7 +89,12 @@ const (
pollStatsTimeout = 18 * time.Second
)

var ctxTimeoutStopContainer = dockerclient.StopContainerTimeout
// stopContainerTimeoutBuffer is a buffer added to the timeout passed into the docker
// StopContainer api call. The reason for this buffer is that when the regular "stop"
// command fails, the docker api falls back to other kill methods, such as a containerd
// kill and SIGKILL. This buffer adds time onto the context timeout to allow time
// for these backup kill methods to finish.
var stopContainerTimeoutBuffer = 2 * time.Minute

type inactivityTimeoutHandlerFunc func(reader io.ReadCloser, timeout time.Duration, cancelRequest func(), canceled *uint32) (io.ReadCloser, chan<- struct{})

Expand Down Expand Up @@ -659,9 +663,7 @@ func (dg *dockerGoClient) inspectContainer(ctx context.Context, dockerID string)
}

func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata {
// ctxTimeout is sum of timeout(applied to the StopContainer api call) and a fixed constant dockerclient.StopContainerTimeout
// the context's timeout should be greater than the sigkill timout for the StopContainer call
ctxTimeout := timeout + ctxTimeoutStopContainer
ctxTimeout := timeout + stopContainerTimeoutBuffer
ctx, cancel := context.WithTimeout(ctx, ctxTimeout)
defer cancel()
defer metrics.MetricsEngineGlobal.RecordDockerMetric("STOP_CONTAINER")()
Expand Down
10 changes: 7 additions & 3 deletions agent/dockerclient/dockerapi/docker_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,11 @@ func TestStopContainerTimeout(t *testing.T) {
cfg.DockerStopTimeout = xContainerShortTimeout
mockDockerSDK, client, _, _, _, done := dockerClientSetupWithConfig(t, cfg)
defer done()
ctxTimeoutStopContainer = xContainerShortTimeout
reset := stopContainerTimeoutBuffer
stopContainerTimeoutBuffer = xContainerShortTimeout
defer func() {
stopContainerTimeoutBuffer = reset
}()

wait := &sync.WaitGroup{}
wait.Add(1)
Expand All @@ -488,7 +492,7 @@ func TestStopContainerTimeout(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
metadata := client.StopContainer(ctx, "id", xContainerShortTimeout)
assert.Error(t, metadata.Error, "Expected error for pull timeout")
assert.Error(t, metadata.Error, "Expected error for stop timeout")
assert.Equal(t, "DockerTimeoutError", metadata.Error.(apierrors.NamedError).ErrorName())
wait.Done()
}
Expand All @@ -514,7 +518,7 @@ func TestStopContainer(t *testing.T) {
)
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
metadata := client.StopContainer(ctx, "id", dockerclient.StopContainerTimeout)
metadata := client.StopContainer(ctx, "id", client.config.DockerStopTimeout)
assert.NoError(t, metadata.Error)
assert.Equal(t, "id", metadata.DockerID)
}
Expand Down
2 changes: 0 additions & 2 deletions agent/dockerclient/timeout.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ const (
ListContainersTimeout = 10 * time.Minute
// InspectContainerTimeout is the timeout for the InspectContainer API.
InspectContainerTimeout = 30 * time.Second
// StopContainerTimeout is the timeout for the StopContainer API.
StopContainerTimeout = 30 * time.Second
// RemoveContainerTimeout is the timeout for the RemoveContainer API.
RemoveContainerTimeout = 5 * time.Minute

Expand Down
4 changes: 2 additions & 2 deletions agent/engine/docker_task_engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ func TestSteadyStatePoll(t *testing.T) {
dockerapi.DockerContainerMetadata{
DockerID: containerID,
}).AnyTimes()
client.EXPECT().StopContainer(gomock.Any(), containerID, dockerclient.StopContainerTimeout).AnyTimes()
client.EXPECT().StopContainer(gomock.Any(), containerID, 30*time.Second).AnyTimes()

err := taskEngine.Init(ctx) // start the task engine
assert.NoError(t, err)
Expand Down Expand Up @@ -869,7 +869,7 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
containerStopTimeoutError := dockerapi.DockerContainerMetadata{
Error: &dockerapi.DockerTimeoutError{
Transition: "stop",
Duration: dockerclient.StopContainerTimeout,
Duration: 30 * time.Second,
},
}
dockerEventSent := make(chan int)
Expand Down
2 changes: 1 addition & 1 deletion agent/engine/engine_sudo_linux_integ_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ import (

const (
testLogSenderImage = "amazonlinux:2"
testFluentbitImage = "amazon/aws-for-fluent-bit:latest"
testFluentbitImage = "amazon/aws-for-fluent-bit:2.7.0"
testVolumeImage = "127.0.0.1:51670/amazon/amazon-ecs-volumes-test:latest"
testCluster = "testCluster"
validTaskArnPrefix = "arn:aws:ecs:region:account-id:task/"
Expand Down