@@ -173,7 +173,7 @@ func (f *fleetGateway) worker() {
 			// Execute the checkin call and for any errors returned by the fleet-server API
 			// the function will retry to communicate with fleet-server with an exponential delay and some
 			// jitter to help better distribute the load from a fleet of agents.
-			resp, err := f.doExecute()
+			resp, err := f.executeCheckinWithRetries()
 			if err != nil {
 				continue
 			}
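The comment above refers to the gateway's exponential backoff with jitter. As a rough standalone illustration of that idea (this is not the agent's actual backoff package; nextWait, base, and max are made-up names), the delay calculation could look like:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// nextWait grows the delay exponentially with each failed attempt, caps it,
// and adds random jitter so a whole fleet of agents does not retry in lockstep.
func nextWait(attempt int, base, max time.Duration) time.Duration {
	d := base << attempt // base, 2*base, 4*base, ...
	if d <= 0 || d > max {
		d = max // guard against overflow and cap the delay
	}
	half := d / 2
	return half + time.Duration(rand.Int63n(int64(half)+1)) // keep half, randomize the rest
}

func main() {
	for attempt := 0; attempt < 5; attempt++ {
		fmt.Println(nextWait(attempt, time.Second, 30*time.Second))
	}
}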
@@ -274,21 +274,34 @@ func (f *fleetGateway) gatherQueuedActions(ts time.Time) (queued, expired []flee
 	return queued, expired
 }
 
-func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
+func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, error) {
 	f.backoff.Reset()
 
 	// Guard if the context is stopped by a out of bound call,
 	// this mean we are rebooting to change the log level or the system is shutting us down.
 	for f.bgContext.Err() == nil {
 		f.log.Debugf("Checkin started")
-		resp, err := f.execute(f.bgContext)
+		resp, took, err := f.executeCheckin(f.bgContext)
 		if err != nil {
 			f.checkinFailCounter++
-			f.log.Errorf("Could not communicate with fleet-server checkin API will retry, error: %s", err)
+
+			// Report the first two failures at warn level as they may be recoverable with retries.
+			if f.checkinFailCounter <= 2 {
+				f.log.Warnw("Possible transient error during checkin with fleet-server, retrying",
+					"error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter,
+					"retry_after_ns", f.backoff.NextWait())
+			} else {
+				// Only update the local status after repeated failures: https://github.com/elastic/elastic-agent/issues/1148
+				f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil)
+				f.log.Errorw("Cannot checkin in with fleet-server, retrying",
+					"error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter,
+					"retry_after_ns", f.backoff.NextWait())
+			}
+
 			if !f.backoff.Wait() {
 				// Something bad has happened and we log it and we should update our current state.
 				err := errors.New(
-					"execute retry loop was stopped",
+					"checkin retry loop was stopped",
					errors.TypeNetwork,
					errors.M(errors.MetaKeyURI, f.client.URI()),
				)
@@ -297,10 +310,6 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
 				f.localReporter.Update(state.Failed, err.Error(), nil)
 				return nil, err
 			}
-			if f.checkinFailCounter > 1 {
-				f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil)
-				f.log.Errorf("checkin number %d failed: %s", f.checkinFailCounter, err.Error())
-			}
 			continue
 		}
 
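The block removed in this hunk is not dropped outright: the degraded-status update moves into the new else branch earlier in the loop, so the status only changes after the first two failures. A self-contained sketch of that escalation policy (reporter, onCheckinFailure, and the stdout logging are illustrative stand-ins; only the threshold of two mirrors the diff):

package main

import (
	"errors"
	"fmt"
)

// reporter is a stand-in for the agent's local status reporter.
type reporter interface{ Degrade(reason string) }

type stdoutReporter struct{}

func (stdoutReporter) Degrade(reason string) { fmt.Println("status degraded:", reason) }

// onCheckinFailure mirrors the escalation the diff introduces: the first two
// consecutive failures are treated as possibly transient and only warned about;
// from the third failure on, the local status is also marked degraded.
func onCheckinFailure(failCount int, err error, r reporter) {
	if failCount <= 2 {
		fmt.Printf("WARN transient checkin failure #%d: %v\n", failCount, err)
		return
	}
	r.Degrade(fmt.Sprintf("checkin failed: %v", err))
	fmt.Printf("ERROR checkin failure #%d: %v\n", failCount, err)
}

func main() {
	for i := 1; i <= 3; i++ {
		onCheckinFailure(i, errors.New("connection refused"), stdoutReporter{})
	}
}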
@@ -319,7 +328,7 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
 	return nil, f.bgContext.Err()
 }
 
-func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, error) {
+func (f *fleetGateway) executeCheckin(ctx context.Context) (*fleetapi.CheckinResponse, time.Duration, error) {
 	ecsMeta, err := info.Metadata()
 	if err != nil {
 		f.log.Error(errors.New("failed to load metadata", err))
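The widened executeCheckin signature propagates how long the checkin request took so the retry loop can log request_duration_ns. A minimal, generic sketch of that measure-and-return pattern (timed and its callback are hypothetical names, not the fleetapi command API):

package main

import (
	"context"
	"fmt"
	"time"
)

// timed wraps a request function and reports how long it took, mirroring the
// "took" value the checkin path now returns alongside the response.
func timed[T any](ctx context.Context, do func(context.Context) (T, error)) (T, time.Duration, error) {
	start := time.Now()
	resp, err := do(ctx)
	// The duration is returned even on error so callers can still log it.
	return resp, time.Since(start), err
}

func main() {
	resp, took, err := timed(context.Background(), func(ctx context.Context) (string, error) {
		time.Sleep(10 * time.Millisecond) // stand-in for the HTTP checkin call
		return "ok", nil
	})
	fmt.Println(resp, took, err)
}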
@@ -339,23 +348,23 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse,
 		Status: f.statusController.StatusString(),
 	}
 
-	resp, err := cmd.Execute(ctx, req)
+	resp, took, err := cmd.Execute(ctx, req)
 	if isUnauth(err) {
 		f.unauthCounter++
 
 		if f.shouldUnenroll() {
 			f.log.Warnf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
 			return &fleetapi.CheckinResponse{
 				Actions: []fleetapi.Action{&fleetapi.ActionUnenroll{ActionID: "", ActionType: "UNENROLL", IsDetected: true}},
-			}, nil
+			}, took, nil
 		}
 
-		return nil, err
+		return nil, took, err
 	}
 
 	f.unauthCounter = 0
 	if err != nil {
-		return nil, err
+		return nil, took, err
 	}
 
 	// Save the latest ackToken
@@ -367,7 +376,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse,
 		}
 	}
 
-	return resp, nil
+	return resp, took, nil
 }
 
 // shouldUnenroll checks if the max number of trying an invalid key is reached
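For reference, the Warnw and Errorw calls introduced above emit structured key-value pairs rather than a formatted string. A small standalone example of such a call, assuming the gateway's logger behaves like zap's SugaredLogger (which the Warnw/Errorw names suggest); the field names copy the diff, the values are fabricated:

package main

import (
	"errors"
	"time"

	"go.uber.org/zap"
)

func main() {
	// Build a throwaway development logger; the agent wires up its own.
	logger, _ := zap.NewDevelopment()
	defer logger.Sync()
	log := logger.Sugar()

	err := errors.New("fleet-server unreachable")
	log.Warnw("Possible transient error during checkin with fleet-server, retrying",
		"error.message", err,
		"request_duration_ns", 250*time.Millisecond,
		"failed_checkins", 1,
		"retry_after_ns", 30*time.Second,
	)
}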