Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
75101b6
fix: wip
mohammedabdulwahhab Aug 11, 2025
1323498
fix: fix
mohammedabdulwahhab Aug 11, 2025
78ca1cc
fix: fix
mohammedabdulwahhab Aug 11, 2025
8514e2a
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdu…
mohammedabdulwahhab Aug 11, 2025
903baf3
fix: refactor main component type to frontend
mohammedabdulwahhab Aug 11, 2025
86759aa
fix: tests partially fixed
mohammedabdulwahhab Aug 12, 2025
045bae6
fix: parameterize component factory with single vs multinode and fix …
mohammedabdulwahhab Aug 12, 2025
bac8c3f
fix: update vllm yamls to use defaults
mohammedabdulwahhab Aug 12, 2025
b7fb92c
fix: update sglang yamls
mohammedabdulwahhab Aug 12, 2025
a618c0c
fix: trtllm yamls
mohammedabdulwahhab Aug 12, 2025
c7d5ad4
fix: add planner component defaults
mohammedabdulwahhab Aug 12, 2025
97b96c5
fix: set planner defaults
mohammedabdulwahhab Aug 12, 2025
286069f
fix: ai lint yaml files
mohammedabdulwahhab Aug 12, 2025
a2f4110
fix: more tee removals
mohammedabdulwahhab Aug 12, 2025
2ebaf90
fix: more lint
mohammedabdulwahhab Aug 12, 2025
3a79d26
Update components/backends/vllm/deploy/disagg_planner.yaml
mohammedabdulwahhab Aug 12, 2025
fe9c153
fix: fix
mohammedabdulwahhab Aug 12, 2025
4cf9394
Merge branch 'mabdulwahhab/defaults' of https://github.com/ai-dynamo/…
mohammedabdulwahhab Aug 12, 2025
880d2b4
fix: fix merge conflicts
mohammedabdulwahhab Aug 12, 2025
b9f9c43
fix: remove backend param
mohammedabdulwahhab Aug 12, 2025
d3eb5d3
Apply suggestions from code review
mohammedabdulwahhab Aug 12, 2025
352a4e7
fix: remove multinode guard and fix tests
mohammedabdulwahhab Aug 12, 2025
04919b7
Merge branch 'mabdulwahhab/defaults' of https://github.com/ai-dynamo/…
mohammedabdulwahhab Aug 12, 2025
1a58890
fix: fix role
mohammedabdulwahhab Aug 12, 2025
e84d253
fix: planner should add a service account
mohammedabdulwahhab Aug 13, 2025
cbd90e9
fix: add startup probe overrides, add checkMainContainerOverrides
mohammedabdulwahhab Aug 13, 2025
042092e
fix: restore prometheus comp in disagg_planner to use componentType f…
mohammedabdulwahhab Aug 13, 2025
0a738a8
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdu…
mohammedabdulwahhab Aug 13, 2025
66dbc51
fix: update prometheus for sglang as well
mohammedabdulwahhab Aug 13, 2025
d5f6b2d
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdu…
mohammedabdulwahhab Aug 13, 2025
1a05dab
fix: remove validate main container
mohammedabdulwahhab Aug 14, 2025
bf8db83
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdu…
mohammedabdulwahhab Aug 14, 2025
e54b451
fix: fix sglang disagg planner
mohammedabdulwahhab Aug 14, 2025
20e84e5
Apply suggestions from code review
mohammedabdulwahhab Aug 14, 2025
117b0ce
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdu…
mohammedabdulwahhab Aug 14, 2025
7aa3627
Merge branch 'mabdulwahhab/defaults' of https://github.com/ai-dynamo/…
mohammedabdulwahhab Aug 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/backends/sglang/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/sglang/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/sglang/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-disagg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/trtllm/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-agg
componentType: main
componentType: frontend
livenessProbe:
exec:
command:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/trtllm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
timeoutSeconds: 3
failureThreshold: 5
dynamoNamespace: trtllm-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/trtllm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-disagg
componentType: main
componentType: frontend
livenessProbe:
exec:
command:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/trtllm/deploy/disagg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-v1-disagg-router
componentType: main
componentType: frontend
livenessProbe:
exec:
command:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/vllm/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
timeoutSeconds: 5
failureThreshold: 3
dynamoNamespace: vllm-agg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/vllm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: vllm-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion components/backends/vllm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-disagg
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
Expand Down
4 changes: 2 additions & 2 deletions components/backends/vllm/deploy/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-disagg-planner
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
Expand Down Expand Up @@ -101,7 +101,7 @@ spec:
- --profile-results-dir=/workspace/profiling_results
Prometheus:
dynamoNamespace: vllm-disagg-planner
componentType: main
componentType: default
replicas: 1
envs:
- name: PYTHONPATH
Expand Down
2 changes: 1 addition & 1 deletion components/backends/vllm/deploy/disagg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-v1-disagg-router
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ func (s *DynamoComponentDeployment) SetSpec(spec any) {
s.Spec = spec.(DynamoComponentDeploymentSpec)
}

func (s *DynamoComponentDeployment) IsMainComponent() bool {
return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeMain
func (s *DynamoComponentDeployment) IsFrontendComponent() bool {
return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeFrontend
}

func (s *DynamoComponentDeployment) GetDynamoDeploymentConfig() []byte {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) {
func TestDynamoComponentDeployment_IsFrontendComponent(t *testing.T) {
type fields struct {
TypeMeta metav1.TypeMeta
ObjectMeta metav1.ObjectMeta
Expand Down Expand Up @@ -73,8 +73,8 @@ func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) {
Spec: tt.fields.Spec,
Status: tt.fields.Status,
}
if got := s.IsMainComponent(); got != tt.want {
t.Errorf("DynamoComponentDeployment.IsMainComponent() = %v, want %v", got, tt.want)
if got := s.IsFrontendComponent(); got != tt.want {
t.Errorf("DynamoComponentDeployment.IsFrontendComponent() = %v, want %v", got, tt.want)
}
})
}
Expand Down
3 changes: 2 additions & 1 deletion deploy/cloud/operator/internal/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ const (
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"

ComponentTypePlanner = "planner"
ComponentTypeMain = "main"
ComponentTypeFrontend = "frontend"
ComponentTypeWorker = "worker"
ComponentTypeDefault = "default"
PlannerServiceAccountName = "planner-serviceaccount"

DefaultIngressSuffix = "local"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1341,7 +1341,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResour
},
}

if !opt.dynamoComponentDeployment.IsMainComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) {
if !opt.dynamoComponentDeployment.IsFrontendComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) {
// if it's not the frontend component, or if it's neither a generic service nor has stealing traffic debug mode enabled, we don't need to create the service
return kubeService, true, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
})
resources := []Resource{groveGangSetAsResource}
for componentName, component := range dynamoDeployment.Spec.Services {
if component.ComponentType == consts.ComponentTypeMain {
if component.ComponentType == consts.ComponentTypeFrontend {
// generate the frontend component service
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace)
if err != nil {
Expand Down
45 changes: 45 additions & 0 deletions deploy/cloud/operator/internal/dynamo/component_common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

package dynamo

import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)

// ComponentDefaults is the contract a per-component-type defaults provider
// satisfies. ComponentDefaultsFactory returns an implementation selected by
// component type.
type ComponentDefaults interface {
// GetBaseContainer returns the base container configuration for this
// component type.
//
// The backendFramework parameter may be empty for components that don't
// need backend-specific config.
GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error)
}

// ComponentDefaultsFactory selects the ComponentDefaults implementation for
// componentType. The frontend and worker types get their dedicated providers;
// any other value (including the empty string) falls back to
// BaseComponentDefaults.
func ComponentDefaultsFactory(componentType string) ComponentDefaults {
	if componentType == commonconsts.ComponentTypeFrontend {
		return NewFrontendDefaults()
	}
	if componentType == commonconsts.ComponentTypeWorker {
		return NewWorkerDefaults()
	}
	// No specialised provider: use the shared baseline.
	return &BaseComponentDefaults{}
}

// BaseComponentDefaults is the baseline ComponentDefaults implementation. It
// carries no state and is what ComponentDefaultsFactory returns for component
// types without a dedicated provider.
type BaseComponentDefaults struct{}

// GetBaseContainer returns the common container as-is. The backendFramework
// argument is not consulted, and the returned error is always nil.
func (b *BaseComponentDefaults) GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error) {
	base := b.getCommonContainer()
	return base, nil
}

// getCommonContainer returns the starting container: only its Name is set
// (to "main"); every other field is left at its zero value.
func (b *BaseComponentDefaults) getCommonContainer() corev1.Container {
	return corev1.Container{Name: "main"}
}
89 changes: 89 additions & 0 deletions deploy/cloud/operator/internal/dynamo/component_frontend.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

package dynamo

import (
"fmt"

commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)

// FrontendDefaults is the ComponentDefaults implementation for frontend
// components. It embeds *BaseComponentDefaults to reuse the common container
// and overrides GetBaseContainer with frontend-specific settings.
type FrontendDefaults struct {
	*BaseComponentDefaults
}

// NewFrontendDefaults returns a FrontendDefaults whose embedded base is
// initialised (non-nil).
func NewFrontendDefaults() *FrontendDefaults {
	base := &BaseComponentDefaults{}
	return &FrontendDefaults{BaseComponentDefaults: base}
}

// GetBaseContainer builds the default container for a frontend component.
//
// Starting from the common container, it sets:
//   - one TCP container port named commonconsts.DynamoContainerPortName on
//     commonconsts.DynamoServicePort;
//   - a liveness probe doing an HTTP GET of /health on that named port;
//   - a readiness probe that shells out to curl and jq and requires the
//     /health response to report status "healthy";
//   - CPU/memory requests and limits, both fixed at 1 CPU and 2Gi;
//   - the commonconsts.EnvDynamoServicePort environment variable holding the
//     service port.
//
// backendFramework is ignored (the frontend needs no backend-specific
// config), and the returned error is always nil.
func (f *FrontendDefaults) GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error) {
	// Liveness and readiness run on the same schedule; only the handler differs.
	probeWith := func(handler corev1.ProbeHandler) *corev1.Probe {
		return &corev1.Probe{
			ProbeHandler:        handler,
			InitialDelaySeconds: 60,
			PeriodSeconds:       60,
			TimeoutSeconds:      30,
			FailureThreshold:    10,
		}
	}

	// Requests and limits carry the same quantities; each call builds a
	// fresh list so the two maps are independent.
	fixedResources := func() corev1.ResourceList {
		return corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("1"),
			corev1.ResourceMemory: resource.MustParse("2Gi"),
		}
	}

	httpHealth := corev1.ProbeHandler{
		HTTPGet: &corev1.HTTPGetAction{
			Path: "/health",
			Port: intstr.FromString(commonconsts.DynamoContainerPortName),
		},
	}

	// NOTE(review): this command relies on curl and jq being present in the
	// image and on DYNAMO_PORT being set in the container environment —
	// presumably that is what commonconsts.EnvDynamoServicePort names; confirm.
	execHealth := corev1.ProbeHandler{
		Exec: &corev1.ExecAction{
			Command: []string{
				"/bin/sh",
				"-c",
				"curl -s http://localhost:${DYNAMO_PORT}/health | jq -e \".status == \\\"healthy\\\"\"",
			},
		},
	}

	// Frontend doesn't need backend-specific config, so start from the
	// shared baseline and layer the frontend settings on top.
	container := f.getCommonContainer()

	container.Ports = []corev1.ContainerPort{
		{
			Protocol:      corev1.ProtocolTCP,
			Name:          commonconsts.DynamoContainerPortName,
			ContainerPort: int32(commonconsts.DynamoServicePort),
		},
	}
	container.LivenessProbe = probeWith(httpHealth)
	container.ReadinessProbe = probeWith(execHealth)
	container.Resources = corev1.ResourceRequirements{
		Requests: fixedResources(),
		Limits:   fixedResources(),
	}
	container.Env = []corev1.EnvVar{
		{
			Name:  commonconsts.EnvDynamoServicePort,
			Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
		},
	}

	return container, nil
}
Loading
Loading