@@ -19,11 +19,8 @@ package controller
1919
2020import (
2121 "context"
22- "encoding/json"
2322 "fmt"
2423
25- "dario.cat/mergo"
26- corev1 "k8s.io/api/core/v1"
2724 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2825 "k8s.io/client-go/tools/record"
2926 ctrl "sigs.k8s.io/controller-runtime"
@@ -33,9 +30,7 @@ import (
3330 "sigs.k8s.io/controller-runtime/pkg/log"
3431 "sigs.k8s.io/controller-runtime/pkg/predicate"
3532
36- dynamoCommon "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
3733 nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
38- "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
3934 commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
4035 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
4136)
@@ -44,8 +39,6 @@ const (
4439 FailedState = "failed"
4540 ReadyState = "successful"
4641 PendingState = "pending"
47-
48- DYN_DEPLOYMENT_CONFIG_ENV_VAR = "DYN_DEPLOYMENT_CONFIG"
4942)
5043
5144type etcdStorage interface {
@@ -141,37 +134,12 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
141134 }
142135
143136 // merge the dynamoComponentsDeployments with the dynamoComponentsDeployments from the CRD
144- for serviceName , deployment := range dynamoComponentsDeployments {
145- if _ , ok := dynamoDeployment .Spec .Services [serviceName ]; ok {
146- err := mergo .Merge (& deployment .Spec .DynamoComponentDeploymentSharedSpec , dynamoDeployment .Spec .Services [serviceName ].DynamoComponentDeploymentSharedSpec , mergo .WithOverride )
147- if err != nil {
148- logger .Error (err , "failed to merge the DynamoComponentsDeployments" )
149- reason = "failed_to_merge_the_DynamoComponentsDeployments"
150- return ctrl.Result {}, err
151- }
152- }
137+ for _ , deployment := range dynamoComponentsDeployments {
153138 if deployment .Spec .Ingress .Enabled {
154139 dynamoDeployment .SetEndpointStatus (r .isEndpointSecured (), getIngressHost (deployment .Spec .Ingress ))
155140 }
156141 }
157142
158- // Set common env vars on each of the dynamoComponentsDeployments
159- for _ , deployment := range dynamoComponentsDeployments {
160- if len (dynamoDeployment .Spec .Envs ) > 0 {
161- deployment .Spec .Envs = mergeEnvs (dynamoDeployment .Spec .Envs , deployment .Spec .Envs )
162- }
163- err := updateDynDeploymentConfig (deployment , consts .DynamoServicePort )
164- if err != nil {
165- logger .Error (err , fmt .Sprintf ("Failed to update the %v env var" , DYN_DEPLOYMENT_CONFIG_ENV_VAR ))
166- return ctrl.Result {}, err
167- }
168- err = overrideWithDynDeploymentConfig (ctx , deployment )
169- if err != nil {
170- logger .Error (err , fmt .Sprintf ("Failed to override the component config with the %v env var" , DYN_DEPLOYMENT_CONFIG_ENV_VAR ))
171- return ctrl.Result {}, err
172- }
173- }
174-
175143 // reconcile the dynamoComponent
176144 // for now we use the same component for all the services and we differentiate them by the service name when launching the component
177145 dynamoComponent := & nvidiacomv1alpha1.DynamoComponent {
@@ -260,121 +228,6 @@ func (r *DynamoGraphDeploymentReconciler) isEndpointSecured() bool {
260228 return r .IngressControllerTLSSecret != ""
261229}
262230
263- func mergeEnvs (common , specific []corev1.EnvVar ) []corev1.EnvVar {
264- envMap := make (map [string ]corev1.EnvVar )
265-
266- // Add all common environment variables.
267- for _ , env := range common {
268- envMap [env .Name ] = env
269- }
270-
271- // Override or add with service-specific environment variables.
272- for _ , env := range specific {
273- envMap [env .Name ] = env
274- }
275-
276- // Convert the map back to a slice.
277- merged := make ([]corev1.EnvVar , 0 , len (envMap ))
278- for _ , env := range envMap {
279- merged = append (merged , env )
280- }
281- return merged
282- }
283-
284- // updateDynDeploymentConfig updates the DYN_DEPLOYMENT_CONFIG env var for the given dynamoDeploymentComponent
285- // It updates the port for the given service in the DYN_DEPLOYMENT_CONFIG env var (if it is the main component)
286- func updateDynDeploymentConfig (dynamoDeploymentComponent * nvidiacomv1alpha1.DynamoComponentDeployment , newPort int ) error {
287- if dynamoDeploymentComponent .IsMainComponent () {
288- for i , env := range dynamoDeploymentComponent .Spec .Envs {
289- if env .Name == DYN_DEPLOYMENT_CONFIG_ENV_VAR {
290- var config map [string ]any
291- if err := json .Unmarshal ([]byte (env .Value ), & config ); err != nil {
292- return fmt .Errorf ("failed to unmarshal %v: %w" , DYN_DEPLOYMENT_CONFIG_ENV_VAR , err )
293- }
294-
295- // Safely navigate and update the config
296- if serviceConfig , ok := config [dynamoDeploymentComponent .Spec .ServiceName ].(map [string ]any ); ok {
297- if _ , portExists := serviceConfig ["port" ]; portExists {
298- serviceConfig ["port" ] = newPort
299- }
300- }
301-
302- // Marshal back to JSON string
303- updated , err := json .Marshal (config )
304- if err != nil {
305- return fmt .Errorf ("failed to marshal updated config: %w" , err )
306- }
307-
308- // Update env var
309- dynamoDeploymentComponent .Spec .Envs [i ].Value = string (updated )
310- break
311- }
312- }
313- }
314- return nil
315- }
316-
317- func overrideWithDynDeploymentConfig (ctx context.Context , dynamoDeploymentComponent * nvidiacomv1alpha1.DynamoComponentDeployment ) error {
318- for _ , env := range dynamoDeploymentComponent .Spec .Envs {
319- if env .Name == DYN_DEPLOYMENT_CONFIG_ENV_VAR {
320- dynDeploymentConfig , err := dynamo .ParseDynDeploymentConfig (ctx , []byte (env .Value ))
321- if err != nil {
322- return fmt .Errorf ("failed to parse %v: %w" , DYN_DEPLOYMENT_CONFIG_ENV_VAR , err )
323- }
324- componentDynConfig := dynDeploymentConfig [dynamoDeploymentComponent .Spec .ServiceName ]
325- if componentDynConfig != nil {
326- if componentDynConfig .ServiceArgs != nil && componentDynConfig .ServiceArgs .Workers != nil && dynamoDeploymentComponent .Spec .Replicas == nil {
327- // we only override the replicas if it is not set in the CRD.
328- // replicas, if set in the CRD set in the CRD must always be the source of truth.
329- dynamoDeploymentComponent .Spec .Replicas = componentDynConfig .ServiceArgs .Workers
330- }
331- if componentDynConfig .ServiceArgs != nil && componentDynConfig .ServiceArgs .Resources != nil {
332- requests := & dynamoCommon.ResourceItem {}
333- limits := & dynamoCommon.ResourceItem {}
334- if dynamoDeploymentComponent .Spec .Resources == nil {
335- dynamoDeploymentComponent .Spec .Resources = & dynamoCommon.Resources {
336- Requests : requests ,
337- Limits : limits ,
338- }
339- } else {
340- if dynamoDeploymentComponent .Spec .Resources .Requests != nil {
341- requests = dynamoDeploymentComponent .Spec .Resources .Requests
342- } else {
343- dynamoDeploymentComponent .Spec .Resources .Requests = requests
344- }
345- if dynamoDeploymentComponent .Spec .Resources .Limits != nil {
346- limits = dynamoDeploymentComponent .Spec .Resources .Limits
347- } else {
348- dynamoDeploymentComponent .Spec .Resources .Limits = limits
349- }
350- }
351- if componentDynConfig .ServiceArgs .Resources .GPU != nil {
352- requests .GPU = * componentDynConfig .ServiceArgs .Resources .GPU
353- limits .GPU = * componentDynConfig .ServiceArgs .Resources .GPU
354- }
355- if componentDynConfig .ServiceArgs .Resources .CPU != nil {
356- requests .CPU = * componentDynConfig .ServiceArgs .Resources .CPU
357- limits .CPU = * componentDynConfig .ServiceArgs .Resources .CPU
358- }
359- if componentDynConfig .ServiceArgs .Resources .Memory != nil {
360- requests .Memory = * componentDynConfig .ServiceArgs .Resources .Memory
361- limits .Memory = * componentDynConfig .ServiceArgs .Resources .Memory
362- }
363- if componentDynConfig .ServiceArgs .Resources .Custom != nil {
364- requests .Custom = componentDynConfig .ServiceArgs .Resources .Custom
365- limits .Custom = componentDynConfig .ServiceArgs .Resources .Custom
366- }
367- if err := dynamo .SetLwsAnnotations (componentDynConfig .ServiceArgs , dynamoDeploymentComponent ); err != nil {
368- return err
369- }
370- }
371- }
372- break
373- }
374- }
375- return nil
376- }
377-
378231func (r * DynamoGraphDeploymentReconciler ) FinalizeResource (ctx context.Context , dynamoDeployment * nvidiacomv1alpha1.DynamoGraphDeployment ) error {
379232 // for now doing nothing
380233 return nil
0 commit comments