Skip to content

Commit f0e0d92

Browse files
authored
feat(hatchery:k8s): worker config as secret (#6105)
1 parent 2876a3c commit f0e0d92

22 files changed

+332
-175
lines changed

engine/hatchery/kubernetes/helper_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ func NewHatcheryKubernetesTest(t *testing.T) *HatcheryKubernetes {
2323
h.kubeClient = &kubernetesClient{clientSet}
2424
gock.InterceptClient(clientSet.CoreV1().RESTClient().(*rest.RESTClient).Client)
2525

26-
h.Config.Name = "kyubi"
27-
h.Config.Namespace = "hachibi"
26+
h.Config.Name = "my-hatchery"
27+
h.Config.Namespace = "cds-workers"
2828
h.ServiceInstance = &sdk.Service{
2929
CanonicalService: sdk.CanonicalService{
3030
ID: 1,
31-
Name: "kyubi",
31+
Name: "my-hatchery",
3232
},
3333
}
3434
return h

engine/hatchery/kubernetes/kill_workers.go

+34-6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package kubernetes
22

33
import (
44
"context"
5+
"fmt"
56
"strconv"
67
"strings"
78
"time"
@@ -17,20 +18,47 @@ import (
1718
)
1819

1920
func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
20-
pods, err := h.kubeClient.PodList(ctx, h.Config.Namespace, metav1.ListOptions{LabelSelector: LABEL_WORKER})
21+
pods, err := h.kubeClient.PodList(ctx, h.Config.Namespace, metav1.ListOptions{
22+
LabelSelector: fmt.Sprintf("%s=%s,%s", LABEL_HATCHERY_NAME, h.Config.Name, LABEL_WORKER_NAME),
23+
})
2124
if err != nil {
2225
return err
2326
}
27+
28+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
29+
defer cancel()
30+
workers, err := h.CDSClient().WorkerList(ctx)
31+
if err != nil {
32+
return err
33+
}
34+
2435
var globalErr error
2536
for _, pod := range pods.Items {
2637
annotations := pod.GetAnnotations()
2738
labels := pod.GetLabels()
28-
toDelete := false
29-
for _, container := range pod.Status.ContainerStatuses {
39+
if labels == nil {
40+
continue
41+
}
3042

31-
if (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error")) ||
32-
(container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull") {
33-
toDelete = true
43+
var toDelete, found bool
44+
for _, w := range workers {
45+
if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name {
46+
found = true
47+
break
48+
}
49+
}
50+
if !found {
51+
toDelete = true
52+
}
53+
54+
if !toDelete {
55+
for _, container := range pod.Status.ContainerStatuses {
56+
terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error"))
57+
errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull")
58+
if terminated || errImagePull {
59+
toDelete = true
60+
break
61+
}
3462
}
3563
}
3664

engine/hatchery/kubernetes/kill_workers_test.go

+45-12
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"gopkg.in/h2non/gock.v1"
1010
v1 "k8s.io/api/core/v1"
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
13+
"github.com/ovh/cds/sdk"
1214
)
1315

1416
func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
@@ -19,8 +21,12 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
1921
Items: []v1.Pod{
2022
{
2123
ObjectMeta: metav1.ObjectMeta{
22-
Name: "w1",
23-
Namespace: "kyubi",
24+
Name: "worker-1",
25+
Namespace: "cds-workers",
26+
Labels: map[string]string{
27+
LABEL_HATCHERY_NAME: "my-hatchery",
28+
LABEL_WORKER_NAME: "worker-1",
29+
},
2430
},
2531
Status: v1.PodStatus{
2632
ContainerStatuses: []v1.ContainerStatus{
@@ -36,15 +42,23 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
3642
},
3743
{
3844
ObjectMeta: metav1.ObjectMeta{
39-
Name: "wrong",
40-
Namespace: "kyubi",
45+
Name: "worker-2",
46+
Namespace: "cds-workers",
47+
Labels: map[string]string{
48+
LABEL_HATCHERY_NAME: "my-hatchery",
49+
LABEL_WORKER_NAME: "worker-2",
50+
},
4151
},
4252
Spec: v1.PodSpec{},
4353
},
4454
{
4555
ObjectMeta: metav1.ObjectMeta{
46-
Name: "w2",
47-
Namespace: "kyubi",
56+
Name: "worker-3",
57+
Namespace: "cds-workers",
58+
Labels: map[string]string{
59+
LABEL_HATCHERY_NAME: "my-hatchery",
60+
LABEL_WORKER_NAME: "worker-3",
61+
},
4862
},
4963
Status: v1.PodStatus{
5064
ContainerStatuses: []v1.ContainerStatus{
@@ -60,8 +74,12 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
6074
},
6175
{
6276
ObjectMeta: metav1.ObjectMeta{
63-
Name: "w3",
64-
Namespace: "kyubi",
77+
Name: "worker-4",
78+
Namespace: "cds-workers",
79+
Labels: map[string]string{
80+
LABEL_HATCHERY_NAME: "my-hatchery",
81+
LABEL_WORKER_NAME: "worker-4",
82+
},
6583
},
6684
Status: v1.PodStatus{
6785
ContainerStatuses: []v1.ContainerStatus{
@@ -75,13 +93,28 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
7593
},
7694
},
7795
},
96+
{
97+
ObjectMeta: metav1.ObjectMeta{
98+
Name: "worker-5",
99+
Namespace: "cds-workers",
100+
Labels: map[string]string{
101+
LABEL_HATCHERY_NAME: "my-hatchery",
102+
LABEL_WORKER_NAME: "worker-5",
103+
},
104+
},
105+
},
78106
},
79107
}
80-
gock.New("http://lolcat.kube").Get("/api/v1/namespaces/hachibi/pods").Reply(http.StatusOK).JSON(podsList)
108+
gock.New("http://lolcat.kube").Get("/api/v1/namespaces/cds-workers/pods").Reply(http.StatusOK).JSON(podsList)
109+
110+
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/cds-workers/pods/worker-1").Reply(http.StatusOK).JSON(nil)
111+
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/cds-workers/pods/worker-2").Reply(http.StatusOK).JSON(nil)
112+
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/cds-workers/pods/worker-3").Reply(http.StatusOK).JSON(nil)
113+
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/cds-workers/pods/worker-4").Reply(http.StatusOK).JSON(nil)
81114

82-
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/kyubi/pods/w1").Reply(http.StatusOK).JSON(nil)
83-
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/kyubi/pods/w2").Reply(http.StatusOK).JSON(nil)
84-
gock.New("http://lolcat.kube").Delete("/api/v1/namespaces/kyubi/pods/w3").Reply(http.StatusOK).JSON(nil)
115+
gock.New("http://lolcat.api").Get("/worker").Reply(http.StatusOK).JSON([]sdk.Worker{{
116+
Name: "worker-5",
117+
}})
85118

86119
err := h.killAwolWorkers(context.TODO())
87120
require.NoError(t, err)

engine/hatchery/kubernetes/kubernetes.go

+51-36
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/ovh/cds/sdk"
2525
"github.com/ovh/cds/sdk/cdsclient"
2626
"github.com/ovh/cds/sdk/hatchery"
27+
"github.com/ovh/cds/sdk/slug"
2728
)
2829

2930
// New instanciates a new hatchery local
@@ -99,8 +100,12 @@ func (h *HatcheryKubernetes) ApplyConfiguration(cfg interface{}) error {
99100
// Status returns sdk.MonitoringStatus, implements interface service.Service
100101
func (h *HatcheryKubernetes) Status(ctx context.Context) *sdk.MonitoringStatus {
101102
m := h.NewMonitoringStatus()
102-
m.AddLine(sdk.MonitoringStatusLine{Component: "Workers", Value: fmt.Sprintf("%d/%d", len(h.WorkersStarted(ctx)), h.Config.Provision.MaxWorker), Status: sdk.MonitoringStatusOK})
103-
103+
ws, err := h.WorkersStarted(ctx)
104+
if err != nil {
105+
ctx = log.ContextWithStackTrace(ctx, err)
106+
log.Warn(ctx, err.Error())
107+
}
108+
m.AddLine(sdk.MonitoringStatusLine{Component: "Workers", Value: fmt.Sprintf("%d/%d", len(ws), h.Config.Provision.MaxWorker), Status: sdk.MonitoringStatusOK})
104109
return m
105110
}
106111

@@ -171,11 +176,6 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
171176
return sdk.WithStack(fmt.Errorf("no job ID and no register"))
172177
}
173178

174-
label := "execution"
175-
if spawnArgs.RegisterOnly {
176-
label = "register"
177-
}
178-
179179
var logJob string
180180
if spawnArgs.JobID > 0 {
181181
logJob = fmt.Sprintf("for workflow job %d,", spawnArgs.JobID)
@@ -221,7 +221,6 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
221221
envsWm := workerConfig.InjectEnvVars
222222
envsWm["CDS_MODEL_MEMORY"] = fmt.Sprintf("%d", memory)
223223
envsWm["CDS_FROM_WORKER_IMAGE"] = "true"
224-
envsWm["CDS_CONFIG"] = workerConfig.EncodeBase64()
225224

226225
for envName, envValue := range spawnArgs.Model.ModelDocker.Envs {
227226
envsWm[envName] = envValue
@@ -239,16 +238,33 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
239238
pullPolicy = "Always"
240239
}
241240

241+
// Create secret for worker config
242+
configSecretName, err := h.createConfigSecret(ctx, workerConfig)
243+
if err != nil {
244+
return sdk.WrapError(err, "cannot create secret for config %s", workerConfig.Name)
245+
}
246+
envs = append(envs, apiv1.EnvVar{
247+
Name: "CDS_CONFIG",
248+
ValueFrom: &apiv1.EnvVarSource{
249+
SecretKeyRef: &apiv1.SecretKeySelector{
250+
LocalObjectReference: apiv1.LocalObjectReference{
251+
Name: configSecretName,
252+
},
253+
Key: "CDS_CONFIG",
254+
},
255+
},
256+
})
257+
242258
var gracePeriodSecs int64
243259
podSchema := apiv1.Pod{
244260
ObjectMeta: metav1.ObjectMeta{
245261
Name: spawnArgs.WorkerName,
246262
Namespace: h.Config.Namespace,
247263
DeletionGracePeriodSeconds: &gracePeriodSecs,
248264
Labels: map[string]string{
249-
LABEL_WORKER: label,
250-
LABEL_WORKER_MODEL: strings.ToLower(spawnArgs.Model.Name),
251-
LABEL_HATCHERY_NAME: h.Configuration().Name,
265+
LABEL_HATCHERY_NAME: h.Configuration().Name,
266+
LABEL_WORKER_NAME: workerConfig.Name,
267+
LABEL_WORKER_MODEL_PATH: slug.Convert(spawnArgs.Model.Path()),
252268
},
253269
Annotations: map[string]string{},
254270
},
@@ -273,6 +289,15 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
273289
},
274290
}
275291

292+
// Check here to add secret if needed
293+
if spawnArgs.Model.ModelDocker.Private {
294+
secretRegistryName, err := h.createRegistrySecret(ctx, *spawnArgs.Model)
295+
if err != nil {
296+
return sdk.WrapError(err, "cannot create secret for model %s", spawnArgs.Model.Path())
297+
}
298+
podSchema.Spec.ImagePullSecrets = []apiv1.LocalObjectReference{{Name: secretRegistryName}}
299+
}
300+
276301
var services []sdk.Requirement
277302
for _, req := range spawnArgs.Requirements {
278303
if req.Type == sdk.ServiceRequirement {
@@ -286,16 +311,6 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
286311
podSchema.Spec.HostAliases[0].Hostnames[0] = "worker"
287312
}
288313

289-
// Check here to add secret if needed
290-
secretName := "cds-credreg-" + spawnArgs.Model.Name
291-
if spawnArgs.Model.ModelDocker.Private {
292-
if err := h.createSecret(ctx, secretName, *spawnArgs.Model); err != nil {
293-
return sdk.WrapError(err, "cannot create secret for model %s", spawnArgs.Model.Path())
294-
}
295-
podSchema.Spec.ImagePullSecrets = []apiv1.LocalObjectReference{{Name: secretName}}
296-
podSchema.ObjectMeta.Labels[LABEL_SECRET] = secretName
297-
}
298-
299314
for i, serv := range services {
300315
//name= <alias> => the name of the host put in /etc/hosts of the worker
301316
//value= "postgres:latest env_1=blabla env_2=blabla"" => we can add env variables in requirement name
@@ -345,7 +360,7 @@ func (h *HatcheryKubernetes) SpawnWorker(ctx context.Context, spawnArgs hatchery
345360
podSchema.Spec.HostAliases[0].Hostnames[i+1] = strings.ToLower(serv.Name)
346361
}
347362

348-
_, err := h.kubeClient.PodCreate(ctx, h.Config.Namespace, &podSchema, metav1.CreateOptions{})
363+
_, err = h.kubeClient.PodCreate(ctx, h.Config.Namespace, &podSchema, metav1.CreateOptions{})
349364
log.Debug(ctx, "hatchery> kubernetes> SpawnWorker> %s > Pod created", spawnArgs.WorkerName)
350365
return sdk.WithStack(err)
351366
}
@@ -356,20 +371,18 @@ func (h *HatcheryKubernetes) GetLogger() *logrus.Logger {
356371

357372
// WorkersStarted returns the number of instances started but
358373
// not necessarily register on CDS yet
359-
func (h *HatcheryKubernetes) WorkersStarted(ctx context.Context) []string {
360-
list, err := h.kubeClient.PodList(ctx, h.Config.Namespace, metav1.ListOptions{LabelSelector: LABEL_HATCHERY_NAME})
374+
func (h *HatcheryKubernetes) WorkersStarted(ctx context.Context) ([]string, error) {
375+
list, err := h.kubeClient.PodList(ctx, h.Config.Namespace, metav1.ListOptions{
376+
LabelSelector: fmt.Sprintf("%s=%s,%s", LABEL_HATCHERY_NAME, h.Config.Name, LABEL_WORKER_NAME),
377+
})
361378
if err != nil {
362-
log.Warn(ctx, "WorkersStarted> unable to list pods on namespace %s", h.Config.Namespace)
363-
return nil
379+
return nil, sdk.WrapError(err, "unable to list pods on namespace %s", h.Config.Namespace)
364380
}
365381
workerNames := make([]string, 0, list.Size())
366382
for _, pod := range list.Items {
367-
labels := pod.GetLabels()
368-
if labels[LABEL_HATCHERY_NAME] == h.Configuration().Name {
369-
workerNames = append(workerNames, pod.GetName())
370-
}
383+
workerNames = append(workerNames, pod.GetName())
371384
}
372-
return workerNames
385+
return workerNames, nil
373386
}
374387

375388
// NeedRegistration return true if worker model need regsitration
@@ -381,31 +394,33 @@ func (h *HatcheryKubernetes) NeedRegistration(_ context.Context, m *sdk.Model) b
381394
}
382395

383396
func (h *HatcheryKubernetes) routines(ctx context.Context) {
384-
ticker := time.NewTicker(10 * time.Minute)
397+
ticker := time.NewTicker(10 * time.Second)
385398
defer ticker.Stop()
386399

387400
for {
388401
select {
389402
case <-ticker.C:
390403
h.GoRoutines.Exec(ctx, "getCDNConfiguration", func(ctx context.Context) {
391404
if err := h.Common.RefreshServiceLogger(ctx); err != nil {
392-
log.Error(ctx, "hatchery> kubernetes> cannot get cdn configuration : %v", err)
405+
log.ErrorWithStackTrace(ctx, sdk.WrapError(err, "cannot get cdn configuration"))
393406
}
394407
})
395408

396409
h.GoRoutines.Exec(ctx, "getServicesLogs", func(ctx context.Context) {
397410
if err := h.getServicesLogs(ctx); err != nil {
398-
log.Error(ctx, "Hatchery> Kubernetes> Cannot get service logs : %v", err)
411+
log.ErrorWithStackTrace(ctx, sdk.WrapError(err, "cannot get service logs"))
399412
}
400413
})
401414

402415
h.GoRoutines.Exec(ctx, "killAwolWorker", func(ctx context.Context) {
403-
_ = h.killAwolWorkers(ctx)
416+
if err := h.killAwolWorkers(ctx); err != nil {
417+
log.ErrorWithStackTrace(ctx, sdk.WrapError(err, "cannot delete awol worker"))
418+
}
404419
})
405420

406421
h.GoRoutines.Exec(ctx, "deleteSecrets", func(ctx context.Context) {
407422
if err := h.deleteSecrets(ctx); err != nil {
408-
log.Error(ctx, "hatchery> kubernetes> cannot handle secrets : %v", err)
423+
log.ErrorWithStackTrace(ctx, sdk.WrapError(err, "cannot delete secrets"))
409424
}
410425
})
411426
case <-ctx.Done():

0 commit comments

Comments
 (0)