Skip to content

Commit e45a20b

Browse files
committed
engine: move pull retries inside dockerGoClient
1 parent 4d547a7 commit e45a20b

File tree

5 files changed

+72
-69
lines changed

5 files changed

+72
-69
lines changed

agent/engine/docker_container_engine.go

+46-14
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,24 @@ package engine
1616
import (
1717
"archive/tar"
1818
"bufio"
19+
"context"
1920
"encoding/json"
2021
"io"
2122
"strings"
2223
"sync"
2324
"time"
2425

25-
"golang.org/x/net/context"
26-
2726
"github.com/aws/amazon-ecs-agent/agent/api"
2827
"github.com/aws/amazon-ecs-agent/agent/config"
2928
"github.com/aws/amazon-ecs-agent/agent/ecr"
3029
"github.com/aws/amazon-ecs-agent/agent/engine/dockerauth"
3130
"github.com/aws/amazon-ecs-agent/agent/engine/dockerclient"
3231
"github.com/aws/amazon-ecs-agent/agent/engine/dockeriface"
3332
"github.com/aws/amazon-ecs-agent/agent/engine/emptyvolume"
33+
"github.com/aws/amazon-ecs-agent/agent/utils"
3434
"github.com/aws/amazon-ecs-agent/agent/utils/ttime"
35-
"github.com/cihub/seelog"
3635

36+
"github.com/cihub/seelog"
3737
docker "github.com/fsouza/go-dockerclient"
3838
)
3939

@@ -71,6 +71,13 @@ const (
7171
// StatsInactivityTimeout controls the amount of time we hold open a
7272
// connection to the Docker daemon waiting for stats data
7373
StatsInactivityTimeout = 5 * time.Second
74+
75+
// retry settings for pulling images
76+
maximumPullRetries = 10
77+
minimumPullRetryDelay = 250 * time.Millisecond
78+
maximumPullRetryDelay = 1 * time.Second
79+
pullRetryDelayMultiplier = 1.5
80+
pullRetryJitterMultiplier = 0.2
7481
)
7582

7683
// DockerClient interface to make testing it easier
@@ -218,38 +225,63 @@ func (dg *dockerGoClient) time() ttime.Time {
218225
}
219226

220227
func (dg *dockerGoClient) PullImage(image string, authData *api.RegistryAuthenticationData) DockerContainerMetadata {
228+
// TODO Switch to just using context.WithDeadline and get rid of this funky code
221229
timeout := dg.time().After(pullImageTimeout)
230+
ctx, cancel := context.WithCancel(context.TODO())
222231

223232
response := make(chan DockerContainerMetadata, 1)
224-
go func() { response <- dg.pullImage(image, authData) }()
233+
go func() {
234+
imagePullBackoff := utils.NewSimpleBackoff(minimumPullRetryDelay, maximumPullRetryDelay, pullRetryJitterMultiplier, pullRetryDelayMultiplier)
235+
err := utils.RetryNWithBackoffCtx(ctx, imagePullBackoff, maximumPullRetries, func() error {
236+
err := dg.pullImage(image, authData)
237+
if err != nil {
238+
seelog.Warn("Failed to pull image %s: %s", image, err.Error())
239+
}
240+
return err
241+
})
242+
response <- DockerContainerMetadata{Error: wrapPullErrorAsEngineError(err)}
243+
}()
225244
select {
226245
case resp := <-response:
227246
return resp
228247
case <-timeout:
248+
cancel()
229249
return DockerContainerMetadata{Error: &DockerTimeoutError{pullImageTimeout, "pulled"}}
230250
}
231251
}
232252

233-
func (dg *dockerGoClient) pullImage(image string, authData *api.RegistryAuthenticationData) DockerContainerMetadata {
253+
func wrapPullErrorAsEngineError(err error) engineError {
254+
var retErr engineError
255+
if err != nil {
256+
engErr, ok := err.(engineError)
257+
if !ok {
258+
engErr = CannotPullContainerError{err}
259+
}
260+
retErr = engErr
261+
}
262+
return retErr
263+
}
264+
265+
func (dg *dockerGoClient) pullImage(image string, authData *api.RegistryAuthenticationData) engineError {
234266
log.Debug("Pulling image", "image", image)
235267
client, err := dg.dockerClient()
236268
if err != nil {
237-
return DockerContainerMetadata{Error: CannotGetDockerClientError{version: dg.version, err: err}}
269+
return CannotGetDockerClientError{version: dg.version, err: err}
238270
}
239271

240272
// Special case; this image is not one that should be pulled, but rather
241273
// should be created locally if necessary
242274
if image == emptyvolume.Image+":"+emptyvolume.Tag {
243275
scratchErr := dg.createScratchImageIfNotExists()
244276
if scratchErr != nil {
245-
return DockerContainerMetadata{Error: &api.DefaultNamedError{Name: "CreateEmptyVolumeError", Err: "Could not create empty volume " + scratchErr.Error()}}
277+
return &api.DefaultNamedError{Name: "CreateEmptyVolumeError", Err: "Could not create empty volume " + scratchErr.Error()}
246278
}
247-
return DockerContainerMetadata{}
279+
return nil
248280
}
249281

250282
authConfig, err := dg.getAuthdata(image, authData)
251283
if err != nil {
252-
return DockerContainerMetadata{Error: CannotPullContainerError{err}}
284+
return wrapPullErrorAsEngineError(err)
253285
}
254286

255287
pullDebugOut, pullWriter := io.Pipe()
@@ -314,20 +346,20 @@ func (dg *dockerGoClient) pullImage(image string, authData *api.RegistryAuthenti
314346
break
315347
case pullErr := <-pullFinished:
316348
if pullErr != nil {
317-
return DockerContainerMetadata{Error: CannotPullContainerError{pullErr}}
349+
return CannotPullContainerError{pullErr}
318350
}
319-
return DockerContainerMetadata{}
351+
return nil
320352
case <-timeout:
321-
return DockerContainerMetadata{Error: &DockerTimeoutError{dockerPullBeginTimeout, "pullBegin"}}
353+
return &DockerTimeoutError{dockerPullBeginTimeout, "pullBegin"}
322354
}
323355
log.Debug("Pull began for image", "image", image)
324356
defer log.Debug("Pull completed for image", "image", image)
325357

326358
err = <-pullFinished
327359
if err != nil {
328-
return DockerContainerMetadata{Error: CannotPullContainerError{err}}
360+
return CannotPullContainerError{err}
329361
}
330-
return DockerContainerMetadata{}
362+
return nil
331363
}
332364

333365
func (dg *dockerGoClient) createScratchImageIfNotExists() error {

agent/engine/docker_container_engine_test.go

+17-36
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"github.com/aws/amazon-ecs-agent/agent/engine/dockeriface/mocks"
4141
"github.com/aws/amazon-ecs-agent/agent/engine/emptyvolume"
4242
"github.com/aws/amazon-ecs-agent/agent/utils/ttime/mocks"
43+
"github.com/stretchr/testify/require"
4344
)
4445

4546
// xContainerShortTimeout is a short duration intended to be used by the
@@ -90,15 +91,16 @@ func TestPullImageOutputTimeout(t *testing.T) {
9091
defer done()
9192

9293
pullBeginTimeout := make(chan time.Time)
93-
testTime.EXPECT().After(dockerPullBeginTimeout).Return(pullBeginTimeout)
94-
testTime.EXPECT().After(pullImageTimeout)
94+
testTime.EXPECT().After(dockerPullBeginTimeout).Return(pullBeginTimeout).MinTimes(1)
95+
testTime.EXPECT().After(pullImageTimeout).MinTimes(1)
9596
wait := sync.WaitGroup{}
9697
wait.Add(1)
98+
// multiple invocations will happen due to retries, but all should timeout
9799
mockDocker.EXPECT().PullImage(&pullImageOptsMatcher{"image:latest"}, gomock.Any()).Do(func(x, y interface{}) {
98100
pullBeginTimeout <- time.Now()
99101
wait.Wait()
100102
// Don't return, verify timeout happens
101-
})
103+
}).Times(maximumPullRetries) // expected number of retries
102104

103105
metadata := client.PullImage("image", nil)
104106
if metadata.Error == nil {
@@ -159,9 +161,7 @@ func TestPullImage(t *testing.T) {
159161
mockDocker.EXPECT().PullImage(&pullImageOptsMatcher{"image:latest"}, gomock.Any()).Return(nil)
160162

161163
metadata := client.PullImage("image", nil)
162-
if metadata.Error != nil {
163-
t.Error("Expected pull to succeed")
164-
}
164+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
165165
}
166166

167167
func TestPullImageTag(t *testing.T) {
@@ -172,9 +172,7 @@ func TestPullImageTag(t *testing.T) {
172172
mockDocker.EXPECT().PullImage(&pullImageOptsMatcher{"image:mytag"}, gomock.Any()).Return(nil)
173173

174174
metadata := client.PullImage("image:mytag", nil)
175-
if metadata.Error != nil {
176-
t.Error("Expected pull to succeed")
177-
}
175+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
178176
}
179177

180178
func TestPullImageDigest(t *testing.T) {
@@ -188,9 +186,7 @@ func TestPullImageDigest(t *testing.T) {
188186
).Return(nil)
189187

190188
metadata := client.PullImage("image@sha256:bc8813ea7b3603864987522f02a76101c17ad122e1c46d790efc0fca78ca7bfb", nil)
191-
if metadata.Error != nil {
192-
t.Error("Expected pull to succeed")
193-
}
189+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
194190
}
195191

196192
func TestPullEmptyvolumeImage(t *testing.T) {
@@ -203,19 +199,13 @@ func TestPullEmptyvolumeImage(t *testing.T) {
203199
mockDocker.EXPECT().InspectImage(emptyvolume.Image+":"+emptyvolume.Tag).Return(nil, errors.New("Does not exist")),
204200
mockDocker.EXPECT().ImportImage(gomock.Any()).Do(func(x interface{}) {
205201
req := x.(docker.ImportImageOptions)
206-
if req.Repository != emptyvolume.Image {
207-
t.Fatal("Expected empty volume repository")
208-
}
209-
if req.Tag != emptyvolume.Tag {
210-
t.Fatal("Expected empty volume repository")
211-
}
202+
require.Equal(t, emptyvolume.Image, req.Repository, "expected empty volume repository")
203+
require.Equal(t, emptyvolume.Tag, req.Tag, "expected empty volume tag")
212204
}),
213205
)
214206

215207
metadata := client.PullImage(emptyvolume.Image+":"+emptyvolume.Tag, nil)
216-
if metadata.Error != nil {
217-
t.Error(metadata.Error)
218-
}
208+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
219209
}
220210

221211
func TestPullExistingEmptyvolumeImage(t *testing.T) {
@@ -229,9 +219,7 @@ func TestPullExistingEmptyvolumeImage(t *testing.T) {
229219
)
230220

231221
metadata := client.PullImage(emptyvolume.Image+":"+emptyvolume.Tag, nil)
232-
if metadata.Error != nil {
233-
t.Error(metadata.Error)
234-
}
222+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
235223
}
236224

237225
func TestPullImageECRSuccess(t *testing.T) {
@@ -285,9 +273,7 @@ func TestPullImageECRSuccess(t *testing.T) {
285273
).Return(nil)
286274

287275
metadata := client.PullImage(image, authData)
288-
if metadata.Error != nil {
289-
t.Error("Expected pull to succeed")
290-
}
276+
assert.NoError(t, metadata.Error, "Expected pull to succeed")
291277
}
292278

293279
func TestPullImageECRAuthFail(t *testing.T) {
@@ -322,12 +308,11 @@ func TestPullImageECRAuthFail(t *testing.T) {
322308
image := imageEndpoint + "/myimage:tag"
323309

324310
ecrClientFactory.EXPECT().GetClient(region, endpointOverride).Return(ecrClient)
311+
// no retries for this error
325312
ecrClient.EXPECT().GetAuthorizationToken(gomock.Any()).Return(nil, errors.New("test error"))
326313

327314
metadata := client.PullImage(image, authData)
328-
if metadata.Error == nil {
329-
t.Error("Expected pull to fail")
330-
}
315+
assert.Error(t, metadata.Error, "expected pull to fail")
331316
}
332317

333318
func TestCreateContainerTimeout(t *testing.T) {
@@ -344,12 +329,8 @@ func TestCreateContainerTimeout(t *testing.T) {
344329
// Don't return, verify timeout happens
345330
})
346331
metadata := client.CreateContainer(config.Config, nil, config.Name, xContainerShortTimeout)
347-
if metadata.Error == nil {
348-
t.Error("Expected error for pull timeout")
349-
}
350-
if metadata.Error.(api.NamedError).ErrorName() != "DockerTimeoutError" {
351-
t.Error("Wrong error type")
352-
}
332+
assert.Error(t, metadata.Error, "expected error for pull timeout")
333+
assert.Equal(t, "DockerTimeoutError", metadata.Error.(api.NamedError).ErrorName())
353334
wait.Done()
354335
}
355336

agent/engine/docker_task_engine.go

+1-17
Original file line numberDiff line numberDiff line change
@@ -553,23 +553,7 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *api.Task,
553553
return DockerContainerMetadata{Error: TaskStoppedBeforePullBeginError{task.Arn}}
554554
}
555555

556-
var metadata DockerContainerMetadata
557-
imagePullBackoff := utils.NewSimpleBackoff(250*time.Millisecond, time.Second, 0.20, 1.5)
558-
_ = utils.RetryNWithBackoff(imagePullBackoff, 10, func() error {
559-
metadata = engine.client.PullImage(container.Image, container.RegistryAuthentication)
560-
if container.IsInternal() {
561-
// "internal" containers are created by the agent and we don't expect to actually pull them
562-
return nil
563-
}
564-
if imagePullError, ok := metadata.Error.(CannotPullContainerError); ok {
565-
seelog.Warnf("When pulling %s: %s", container.Image, imagePullError)
566-
return imagePullError
567-
} else {
568-
// If the pull fails with something other than CannotPullContainerError, or if it is successful, we stop
569-
// retrying
570-
return nil
571-
}
572-
})
556+
metadata := engine.client.PullImage(container.Image, container.RegistryAuthentication)
573557

574558
// Don't add internal images(created by ecs-agent) into imagemanger state
575559
if container.IsInternal() {

agent/engine/engine_mocks.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package engine
1818

1919
import (
20+
context0 "context"
2021
io "io"
2122
time "time"
2223

@@ -180,7 +181,7 @@ func (_m *MockDockerClient) EXPECT() *_MockDockerClientRecorder {
180181
return _m.recorder
181182
}
182183

183-
func (_m *MockDockerClient) ContainerEvents(_param0 context.Context) (<-chan DockerContainerChangeEvent, error) {
184+
func (_m *MockDockerClient) ContainerEvents(_param0 context0.Context) (<-chan DockerContainerChangeEvent, error) {
184185
ret := _m.ctrl.Call(_m, "ContainerEvents", _param0)
185186
ret0, _ := ret[0].(<-chan DockerContainerChangeEvent)
186187
ret1, _ := ret[1].(error)
@@ -304,7 +305,7 @@ func (_mr *_MockDockerClientRecorder) StartContainer(arg0, arg1 interface{}) *go
304305
return _mr.mock.ctrl.RecordCall(_mr.mock, "StartContainer", arg0, arg1)
305306
}
306307

307-
func (_m *MockDockerClient) Stats(_param0 string, _param1 context.Context) (<-chan *go_dockerclient.Stats, error) {
308+
func (_m *MockDockerClient) Stats(_param0 string, _param1 context0.Context) (<-chan *go_dockerclient.Stats, error) {
308309
ret := _m.ctrl.Call(_m, "Stats", _param0, _param1)
309310
ret0, _ := ret[0].(<-chan *go_dockerclient.Stats)
310311
ret1, _ := ret[1].(error)

agent/engine/errors.go

+5
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,11 @@ func (err CannotPullECRContainerError) ErrorName() string {
204204
return "CannotPullECRContainerError"
205205
}
206206

207+
// Retry fulfills the utils.Retrier interface and allows retries to be skipped by utils.Retry* functions
208+
func (err CannotPullECRContainerError) Retry() bool {
209+
return false
210+
}
211+
207212
// CannotCreateContainerError indicates any error when trying to create a container
208213
type CannotCreateContainerError struct {
209214
fromError error

0 commit comments

Comments
 (0)