Add success log message after previous checkin failures (#1327) (#1342)
(cherry picked from commit e614321)

Co-authored-by: Josh Dover <[email protected]>
mergify[bot] and joshdover authored Sep 28, 2022
1 parent d3ee2b0 commit 5f09295
Showing 2 changed files with 12 additions and 5 deletions.
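
Read as a whole, the change is a small observability fix: doExecute already counts consecutive checkin failures in checkinFailCounter, and this commit logs the first success after one or more failures at the same error level as the failure messages. A minimal, runnable sketch of that pattern, with simplified types (checkin here is a hypothetical stand-in for the real fleet-server call in fleet_gateway.go):

    package main

    import (
        "errors"
        "log"
    )

    // checkin is a stand-in for the real fleet-server checkin call; the
    // first two attempts fail to simulate a flaky connection.
    func checkin(attempt int) error {
        if attempt < 2 {
            return errors.New("connection refused")
        }
        return nil
    }

    func main() {
        checkinFailCounter := 0
        for attempt := 0; ; attempt++ {
            if err := checkin(attempt); err != nil {
                checkinFailCounter++
                log.Printf("checkin number %d failed: %v", checkinFailCounter, err)
                continue
            }
            if checkinFailCounter > 0 {
                // Logged at the same level as the failures above so the
                // recovery stays visible when only error-level logs are emitted.
                log.Printf("Checkin request to fleet-server succeeded after %d failures", checkinFailCounter)
            }
            checkinFailCounter = 0
            return
        }
    }

Logging the recovery at error level rather than info is deliberate: an operator who filtered down to error logs during an outage would otherwise never see that checkins resumed.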
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
@@ -117,6 +117,7 @@
 - Remove fleet event reporter and events from checkin body. {issue}993[993]
 - Fix unintended reset of source URI when downloading components {pull}1252[1252]
 - Create separate status reporter for local only events so that degraded fleet-checkins no longer affect health on successful fleet-checkins. {issue}1157[1157] {pull}1285[1285]
+- Add success log message after previous checkin failures {pull}1327[1327]
 
 ==== New features
 
16 changes: 11 additions & 5 deletions internal/pkg/agent/application/gateway/fleet/fleet_gateway.go
@@ -210,7 +210,7 @@ func (f *fleetGateway) worker() {
 				f.statusReporter.Update(state.Failed, errMsg, nil)
 			} else {
 				f.statusReporter.Update(state.Healthy, "", nil)
-				f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checking succeeds
+				f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checkin succeeds
 			}
 
 		case <-f.bgContext.Done():
@@ -280,11 +280,11 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
 	// Guard if the context is stopped by a out of bound call,
 	// this mean we are rebooting to change the log level or the system is shutting us down.
 	for f.bgContext.Err() == nil {
-		f.log.Debugf("Checking started")
+		f.log.Debugf("Checkin started")
 		resp, err := f.execute(f.bgContext)
 		if err != nil {
 			f.checkinFailCounter++
-			f.log.Errorf("Could not communicate with fleet-server Checking API will retry, error: %s", err)
+			f.log.Errorf("Could not communicate with fleet-server checkin API will retry, error: %s", err)
 			if !f.backoff.Wait() {
 				// Something bad has happened and we log it and we should update our current state.
 				err := errors.New(
@@ -299,10 +299,16 @@
 			}
 			if f.checkinFailCounter > 1 {
 				f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil)
-				f.log.Errorf("checking number %d failed: %s", f.checkinFailCounter, err.Error())
+				f.log.Errorf("checkin number %d failed: %s", f.checkinFailCounter, err.Error())
 			}
 			continue
 		}
+
+		if f.checkinFailCounter > 0 {
+			// Log at same level as error logs above so subsequent successes are visible when log level is set to 'error'.
+			f.log.Errorf("Checkin request to fleet-server succeeded after %d failures", f.checkinFailCounter)
+		}
+
 		f.checkinFailCounter = 0
 		// Request was successful, return the collected actions.
 		return resp, nil
@@ -338,7 +344,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse,
 	f.unauthCounter++
 
 	if f.shouldUnenroll() {
-		f.log.Warnf("retrieved an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
+		f.log.Warnf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
 		return &fleetapi.CheckinResponse{
 			Actions: []fleetapi.Action{&fleetapi.ActionUnenroll{ActionID: "", ActionType: "UNENROLL", IsDetected: true}},
 		}, nil
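The last hunk touches the existing unenroll guard rather than the new logging: each invalid-API-key response increments unauthCounter, and once shouldUnenroll reports true the gateway synthesizes an UNENROLL action for itself. Neither the threshold nor shouldUnenroll's body appears in this diff, so the sketch below is a hypothetical reading of that pattern (maxUnauthorizedResponses is an assumed name and value):

    package main

    import "fmt"

    // maxUnauthorizedResponses is an assumption for illustration; the real
    // limit is defined elsewhere in the agent, not in this diff.
    const maxUnauthorizedResponses = 7

    type gateway struct {
        unauthCounter int
    }

    // shouldUnenroll mirrors the call in the diff: unenroll once enough
    // consecutive invalid-api-key responses have been seen.
    func (g *gateway) shouldUnenroll() bool {
        return g.unauthCounter > maxUnauthorizedResponses
    }

    func main() {
        g := &gateway{}
        for {
            g.unauthCounter++ // each invalid-api-key response bumps the counter
            if g.shouldUnenroll() {
                fmt.Printf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.\n", g.unauthCounter)
                return
            }
        }
    }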
