@@ -41,6 +41,10 @@ const (
41
41
// Default websocket client disconnection timeout initiated by agent
42
42
defaultDisconnectionTimeout = 15 * time .Minute
43
43
defaultDisconnectionJitter = 30 * time .Minute
44
+ backoffMin = 1 * time .Second
45
+ backoffMax = 1 * time .Minute
46
+ jitterMultiple = 0.2
47
+ multiple = 2
44
48
)
45
49
46
50
type DockerTelemetrySession struct {
@@ -107,22 +111,25 @@ func NewDockerTelemetrySession(
107
111
// discoverPollEndpoint and tcshandler.TelemetrySession's StartTelemetrySession errors are handled
108
112
// (retryWithBackoff or return) in a combined manner
109
113
func (session * DockerTelemetrySession ) Start (ctx context.Context ) error {
110
- backoff := retry .NewExponentialBackoff (time . Second , 1 * time . Minute , 0.2 , 2 )
114
+ backoff := retry .NewExponentialBackoff (backoffMin , backoffMax , jitterMultiple , multiple )
111
115
for {
112
116
select {
113
117
case <- ctx .Done ():
114
- logger .Info ("TCS session exited cleanly." )
118
+ logger .Info ("ECS Telemetry service ( TCS) session exited cleanly." )
115
119
return nil
116
120
default :
117
121
}
118
122
endpoint , tcsError := discoverPollEndpoint (session .containerInstanceArn , session .ecsClient )
119
123
if tcsError == nil {
124
+ // returning from StartTelemetrySession indicates a disconnection, need to reconnect.
120
125
tcsError = session .s .StartTelemetrySession (ctx , endpoint )
121
126
}
122
127
if tcsError == nil || tcsError == io .EOF {
128
+ // reset backoff when the TCS connection closed for a valid reason, such as the connection expiring due to inactivity
123
129
logger .Info ("TCS Websocket connection closed for a valid reason" )
124
130
backoff .Reset ()
125
131
} else {
132
+ // backoff when there is an unexpected error, such as an invalid frame sent through the connection.
126
133
logger .Error ("Error: lost websocket connection with ECS Telemetry service (TCS)" , logger.Fields {
127
134
field .Error : tcsError ,
128
135
})
0 commit comments