-
Notifications
You must be signed in to change notification settings - Fork 471
/
health_controller.go
298 lines (261 loc) · 9.13 KB
/
health_controller.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
package healthcheck
import (
"net/http"
"strconv"
"sync"
"time"
"github.com/cloudnativelabs/kube-router/v2/pkg/options"
"golang.org/x/net/context"
"k8s.io/klog/v2"
)
// Timing constants for the health controller, followed by the identifiers of the components that send heartbeats.
const (
// HPCStaticSyncInterval is a sync interval expressed in seconds; in this block it only feeds HPCSyncPeriod.
HPCStaticSyncInterval = 60
// HPCSyncPeriod is HPCStaticSyncInterval converted to a time.Duration.
// NOTE(review): within this file it is referenced only by the commented-out hairpin check in CheckHealth; confirm
// whether other packages rely on it.
HPCSyncPeriod = time.Duration(HPCStaticSyncInterval) * time.Second
// defaultGraceTimeDuration is the extra slack CheckHealth adds to each controller's allowed heartbeat gap.
defaultGraceTimeDuration = time.Duration(1500) * time.Millisecond
// healthControllerTickTime is the period of the ticker that makes RunCheck re-evaluate health.
healthControllerTickTime = 5000 * time.Millisecond
// Defined health checks
// NOTE: iota counts every entry of the enclosing const block, so NetworkRoutesController is 4 (not 0) and the
// identifiers below continue from 5. Inserting or removing a constant above this line shifts all of these values.
NetworkRoutesController = iota
LoadBalancerController
NetworkPolicyController
NetworkServicesController
HairpinController
MetricsController
RouteSyncController
)
var (
// HeartBeatCompNames maps a component identifier (the constants declared above) to the name of that component.
// HandleHeartbeat uses it to label the "Received heartbeat" log line; an identifier missing from the map yields
// the empty string there.
HeartBeatCompNames = map[int]string{
NetworkRoutesController: "NetworkRoutesController",
LoadBalancerController: "LoadBalancerController",
NetworkPolicyController: "NetworkPolicyController",
NetworkServicesController: "NetworkServicesController",
HairpinController: "HairpinController",
MetricsController: "MetricsController",
RouteSyncController: "RouteSyncController",
}
)
// ControllerHeartbeat is the message a controller sends to report that it is still running. SendHeartBeat builds
// one and HandleHeartbeat consumes it.
type ControllerHeartbeat struct {
// Component identifies the sender; HandleHeartbeat switches on it using the component constants of this package.
Component int
// LastHeartBeat is the time the heartbeat was created (SendHeartBeat fills it with time.Now()).
LastHeartBeat time.Time
}
// HealthController reports the health of the controller loops as a http endpoint
type HealthController struct {
// HealthPort is the TCP port RunServer binds the HTTP server to.
HealthPort uint16
// HTTPEnabled is set by RunServer: true when Config.HealthPort is greater than zero, false otherwise.
HTTPEnabled bool
// Status holds the latest heartbeats and the overall health verdict.
Status HealthStats
// Config is the kube-router configuration; CheckHealth reads the Run*/MetricsEnabled switches and the sync
// periods from it.
Config *options.KubeRouterConfig
}
// HealthStats holds the latest heartbeat received from each controller together with the overall health verdict.
// The embedded mutex is taken by HandleHeartbeat while it updates the heartbeat fields.
type HealthStats struct {
sync.Mutex
// Healthy is the most recent result of CheckHealth; RunCheck stores it and Handler reports it.
Healthy bool
// For each controller below, <Name>Alive is the timestamp of its last heartbeat and <Name>AliveTTL is a one-time
// measurement: while the TTL is still zero, HandleHeartbeat sets it to the time elapsed since the previous
// <Name>Alive value. CheckHealth adds the TTL to that controller's allowed heartbeat gap.
LoadBalancerControllerAlive time.Time
LoadBalancerControllerAliveTTL time.Duration
// MetricsControllerAlive has no TTL companion; CheckHealth compares it against a fixed 5 second limit.
MetricsControllerAlive time.Time
NetworkPolicyControllerAlive time.Time
NetworkPolicyControllerAliveTTL time.Duration
NetworkRoutingControllerAlive time.Time
NetworkRoutingControllerAliveTTL time.Duration
NetworkServicesControllerAlive time.Time
NetworkServicesControllerAliveTTL time.Duration
// The hairpin fields are still updated by HandleHeartbeat, but the check that would read them in CheckHealth is
// commented out.
HairpinControllerAlive time.Time
HairpinControllerAliveTTL time.Duration
RouteSyncControllerAlive time.Time
RouteSyncControllerAliveTTL time.Duration
}
// SendHeartBeat builds a heartbeat for component, stamped with the current time, and sends a pointer to it on
// channel. The call blocks until the channel accepts the value.
func SendHeartBeat(channel chan<- *ControllerHeartbeat, component int) {
	channel <- &ControllerHeartbeat{
		Component:     component,
		LastHeartBeat: time.Now(),
	}
}
// Handler writes HTTP responses to the health path: 200 with body "OK\n" while the controller is marked healthy,
// 500 with body "Unhealthy" otherwise. The incoming request is not inspected.
func (hc *HealthController) Handler(w http.ResponseWriter, _ *http.Request) {
	// Healthy is written by RunCheck on its own goroutine while the HTTP server calls this handler concurrently,
	// so take a snapshot under the Status mutex instead of reading the field unsynchronised.
	hc.Status.Lock()
	healthy := hc.Status.Healthy
	hc.Status.Unlock()

	if healthy {
		w.WriteHeader(http.StatusOK)
		if _, err := w.Write([]byte("OK\n")); err != nil {
			klog.Errorf("Failed to write body: %s", err)
		}
		return
	}

	w.WriteHeader(http.StatusInternalServerError)
	/*
		statusText := fmt.Sprintf("Service controller last alive %s\n ago"+
			"Routing controller last alive: %s\n ago"+
			"Policy controller last alive: %s\n ago"+
			"Metrics controller last alive: %s\n ago",
			time.Since(hc.Status.NetworkServicesControllerAlive),
			time.Since(hc.Status.NetworkRoutingControllerAlive),
			time.Since(hc.Status.NetworkPolicyControllerAlive),
			time.Since(hc.Status.MetricsControllerAlive))
		w.Write([]byte(statusText))
	*/
	if _, err := w.Write([]byte("Unhealthy")); err != nil {
		klog.Errorf("Failed to write body: %s", err)
	}
}
// HandleHeartbeat records a heartbeat received on the health channel. It logs the sender and then, holding the
// Status mutex, stores beat.LastHeartBeat as the sender's latest alive timestamp. Heartbeats from components that
// are not listed in the switch are logged and otherwise ignored.
func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
	klog.V(3).Infof("Received heartbeat from %s", HeartBeatCompNames[beat.Component])

	hc.Status.Lock()
	defer hc.Status.Unlock()

	status := &hc.Status

	// record stores the heartbeat for a controller that has a TTL. While the TTL is still zero (normally only on
	// the first heartbeat) it is initialised to the time elapsed since the previous alive timestamp; CheckHealth
	// later adds this TTL, plus a static grace time, to the controller's allowed heartbeat gap.
	record := func(lastAlive *time.Time, ttl *time.Duration) {
		if *ttl == 0 {
			*ttl = time.Since(*lastAlive)
		}
		*lastAlive = beat.LastHeartBeat
	}

	switch beat.Component {
	case LoadBalancerController:
		record(&status.LoadBalancerControllerAlive, &status.LoadBalancerControllerAliveTTL)
	case NetworkServicesController:
		record(&status.NetworkServicesControllerAlive, &status.NetworkServicesControllerAliveTTL)
	case HairpinController:
		record(&status.HairpinControllerAlive, &status.HairpinControllerAliveTTL)
	case NetworkRoutesController:
		record(&status.NetworkRoutingControllerAlive, &status.NetworkRoutingControllerAliveTTL)
	case RouteSyncController:
		record(&status.RouteSyncControllerAlive, &status.RouteSyncControllerAliveTTL)
	case NetworkPolicyController:
		record(&status.NetworkPolicyControllerAlive, &status.NetworkPolicyControllerAliveTTL)
	case MetricsController:
		// The metrics controller has no TTL; only its timestamp is tracked.
		status.MetricsControllerAlive = beat.LastHeartBeat
	}
}
// CheckHealth compares the time since each enabled controller's last heartbeat with the gap that controller is
// allowed (its configured sync period + its recorded TTL + defaultGraceTimeDuration). It logs every controller
// that exceeded its gap and returns false if at least one did, true otherwise. Controllers that are switched off
// in Config are not checked.
func (hc *HealthController) CheckHealth() bool {
	cfg, status := hc.Config, &hc.Status
	healthy := true

	// overdue reports whether the last heartbeat is older than the allowance plus the static grace time.
	overdue := func(lastBeat time.Time, allowance time.Duration) bool {
		return time.Since(lastBeat) > allowance+defaultGraceTimeDuration
	}

	if cfg.RunFirewall &&
		overdue(status.NetworkPolicyControllerAlive, cfg.IPTablesSyncPeriod+status.NetworkPolicyControllerAliveTTL) {
		klog.Error("Network Policy Controller heartbeat missed")
		healthy = false
	}

	if cfg.RunLoadBalancer &&
		overdue(status.LoadBalancerControllerAlive, cfg.LoadBalancerSyncPeriod+status.LoadBalancerControllerAliveTTL) {
		klog.Error("Load Balancer Allocator Controller heartbeat missed")
		healthy = false
	}

	if cfg.RunRouter {
		// Running the router implies two loops that each report on their own.
		if overdue(status.NetworkRoutingControllerAlive,
			cfg.RoutesSyncPeriod+status.NetworkRoutingControllerAliveTTL) {
			klog.Error("Network Routing Controller heartbeat missed")
			healthy = false
		}
		if overdue(status.RouteSyncControllerAlive,
			cfg.InjectedRoutesSyncPeriod+status.RouteSyncControllerAliveTTL) {
			klog.Error("Routes Sync Controller heartbeat missed")
			healthy = false
		}
	}

	if cfg.RunServiceProxy {
		if overdue(status.NetworkServicesControllerAlive,
			cfg.IpvsSyncPeriod+status.NetworkServicesControllerAliveTTL) {
			klog.Error("NetworkService Controller heartbeat missed")
			healthy = false
		}
		// The hairpin controller check is currently disabled:
		// if time.Since(hc.Status.HairpinControllerAlive) >
		// HPCSyncPeriod+hc.Status.HairpinControllerAliveTTL+graceTime {
		// klog.Error("Hairpin Controller heartbeat missed")
		// health = false
		// }
	}

	// The metrics controller has no TTL or grace time; it is held to a fixed 5 second limit.
	if cfg.MetricsEnabled && time.Since(status.MetricsControllerAlive) > 5*time.Second {
		klog.Error("Metrics Controller heartbeat missed")
		healthy = false
	}

	return healthy
}
// RunServer starts the HealthController's HTTP server and blocks until stopCh delivers a value or is closed.
//
// The server exposes hc.Handler on /healthz and listens on hc.HealthPort. It is only started when
// hc.Config.HealthPort is greater than zero; hc.HTTPEnabled records that decision. On shutdown a started server is
// stopped with Shutdown. wg.Done is called when the method returns.
func (hc *HealthController) RunServer(stopCh <-chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()

	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", hc.Handler)
	srv := &http.Server{
		Addr:              ":" + strconv.Itoa(int(hc.HealthPort)),
		Handler:           mux,
		ReadHeaderTimeout: 5 * time.Second,
	}

	hc.HTTPEnabled = hc.Config.HealthPort > 0
	if hc.HTTPEnabled {
		go func() {
			// ListenAndServe always returns a non-nil error. http.ErrServerClosed is the expected result of the
			// Shutdown call below, so it must not be reported as a failure. Anything else (e.g. the port is
			// already in use) is logged; we cannot panic here.
			if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
				klog.Errorf("Health controller error: %s", err)
			}
		}()
	}

	// block until we receive a shut down signal
	<-stopCh
	klog.Infof("Shutting down health controller")
	if hc.HTTPEnabled {
		if err := srv.Shutdown(context.Background()); err != nil {
			klog.Errorf("could not shutdown: %v", err)
		}
	}
}
// RunCheck starts the HealthController's check loop. It runs until stopCh delivers a value or is closed.
//
// Each heartbeat received on healthChan is passed to HandleHeartbeat. After every heartbeat, and at least every
// healthControllerTickTime, the overall health is recomputed with CheckHealth and stored in hc.Status.Healthy.
// wg.Done is called when the method returns.
func (hc *HealthController) RunCheck(healthChan <-chan *ControllerHeartbeat, stopCh <-chan struct{},
	wg *sync.WaitGroup) {
	defer wg.Done()

	t := time.NewTicker(healthControllerTickTime)
	// Release the ticker's resources once the loop exits.
	defer t.Stop()

	for {
		select {
		case <-stopCh:
			klog.Infof("Shutting down HealthController RunCheck")
			return
		case heartbeat := <-healthChan:
			hc.HandleHeartbeat(heartbeat)
		case <-t.C:
			klog.V(4).Info("Health controller tick")
		}

		// Compute the verdict before taking the mutex so the lock is held only for the assignment. Handler reads
		// Healthy from HTTP server goroutines, so the write must not be unsynchronised.
		healthy := hc.CheckHealth()
		hc.Status.Lock()
		hc.Status.Healthy = healthy
		hc.Status.Unlock()
	}
}
// SetAlive stamps the last-heartbeat time of every tracked controller with the current time. All controllers get
// the same timestamp; the TTL fields and the Healthy flag are left untouched.
func (hc *HealthController) SetAlive() {
	stamp := time.Now()
	status := &hc.Status

	for _, lastSeen := range []*time.Time{
		&status.LoadBalancerControllerAlive,
		&status.MetricsControllerAlive,
		&status.NetworkPolicyControllerAlive,
		&status.NetworkRoutingControllerAlive,
		&status.NetworkServicesControllerAlive,
		&status.HairpinControllerAlive,
		&status.RouteSyncControllerAlive,
	} {
		*lastSeen = stamp
	}
}
// NewHealthController returns a HealthController bound to config. The health port is copied from
// config.HealthPort and the controller starts out marked healthy. The returned error is always nil.
func NewHealthController(config *options.KubeRouterConfig) (*HealthController, error) {
	controller := &HealthController{
		HealthPort: config.HealthPort,
		Config:     config,
	}
	controller.Status.Healthy = true

	return controller, nil
}