Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions components/backends/sglang/deploy/disagg-multinode.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated (separate prefill / decode) SGLang deployment in which each
# worker spans multiple nodes.
#
# Sizing: every worker sets `numberOfNodes: 2` and a per-node limit of
# `gpu: "4"`, i.e. 2 x 4 = 8 GPUs per worker, which matches `--tp-size 8`.
# Keep these three values in sync when resizing.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-disagg-multinode
spec:
  envs:
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: HF_TOKEN
    # NOTE(review): assumes the pod network interface is named eth0 -
    # confirm for your cluster's CNI.
    - name: GLOO_SOCKET_IFNAME
      value: "eth0"
  backendFramework: sglang
  services:
    Frontend:
      dynamoNamespace: sglang-disagg-multinode
      componentType: main
      replicas: 1
      extraPodSpec:
        mainContainer:
          # Placeholder image reference - replace with your registry and tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg-multinode && python3 -m dynamo.frontend --http-port=8000"
    decode:
      numberOfNodes: 2
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-disagg-multinode
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "40Gi"
        limits:
          cpu: "10"
          memory: "40Gi"
          # GPUs per node; total is numberOfNodes * gpu.
          gpu: "4"
      extraPodSpec:
        mainContainer:
          # Placeholder image reference - replace with your registry and tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          # `sh -c` executes only its first argument as the command string
          # (further arguments become $0, $1, ...), so the whole command line
          # is passed as ONE folded string, the same way Frontend does it.
          args:
            - >-
              python3 -m dynamo.sglang.decode_worker
              --model-path meta-llama/Llama-3.3-70B-Instruct
              --served-model-name meta-llama/Llama-3.3-70B-Instruct
              --tp-size 8
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode decode
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
              --mem-fraction-static 0.82
    prefill:
      numberOfNodes: 2
      envFromSecret: hf-token-secret
      # Must match the namespace used by Frontend and decode.
      dynamoNamespace: sglang-disagg-multinode
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "40Gi"
        limits:
          cpu: "10"
          memory: "40Gi"
          # GPUs per node; total is numberOfNodes * gpu.
          gpu: "4"
      extraPodSpec:
        mainContainer:
          # Placeholder image reference - replace with your registry and tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          # Single command string for `sh -c`; see the note on the decode
          # service's args for why this is not a list of separate tokens.
          args:
            - >-
              python3 -m dynamo.sglang.worker
              --model-path meta-llama/Llama-3.3-70B-Instruct
              --served-model-name meta-llama/Llama-3.3-70B-Instruct
              --tp-size 8
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode prefill
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
              --mem-fraction-static 0.82
Original file line number Diff line number Diff line change
Expand Up @@ -10009,6 +10009,13 @@ spec:
format: int32
type: integer
type: object
numberOfNodes:
default: 1
description: |-
Indicates the number of nodes to deploy for multinode components.
Total number of GPUs is NumberOfNodes * GPU limit.
format: int32
type: integer
pvc:
description: PVC config describing volumes to be mounted by the component.
properties:
Expand Down Expand Up @@ -10199,13 +10206,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
requests:
properties:
Expand All @@ -10216,13 +10222,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
type: object
serviceName:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10108,6 +10108,13 @@ spec:
format: int32
type: integer
type: object
numberOfNodes:
default: 1
description: |-
Indicates the number of nodes to deploy for multinode components.
Total number of GPUs is NumberOfNodes * GPU limit.
format: int32
type: integer
pvc:
description: PVC config describing volumes to be mounted by the component.
properties:
Expand Down Expand Up @@ -10298,13 +10305,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
requests:
properties:
Expand All @@ -10315,13 +10321,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
type: object
serviceName:
Expand Down
4 changes: 3 additions & 1 deletion deploy/cloud/operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ manifests: controller-gen ensure-yq ## Generate WebhookConfiguration, ClusterRol
yq eval '.metadata.annotations."helm.sh/resource-policy" = "keep"' -i "$$file"; \
fi; \
done
cp config/crd/bases/*.yaml ../helm/crds/templates/
if [ -d "../helm/crds/templates/" ]; then \
cp config/crd/bases/*.yaml ../helm/crds/templates/; \
fi

.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Expand Down
7 changes: 3 additions & 4 deletions deploy/cloud/operator/api/dynamo/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,9 @@ import (
type ResourceItem struct {
CPU string `json:"cpu,omitempty"`
Memory string `json:"memory,omitempty"`
// GPU is the number of GPUs to request per node.
GPU string `json:"gpu,omitempty"`
// Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
Nodes string `json:"nodes,omitempty"`
// Indicates the number of GPUs to request.
// total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
GPU string `json:"gpu,omitempty"`
Custom map[string]string `json:"custom,omitempty"`
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ type DynamoComponentDeploymentSharedSpec struct {
ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"`
// Replicas is the desired number of Pods for this component when autoscaling is not used.
Replicas *int32 `json:"replicas,omitempty"`
// +kubebuilder:default=1
// Indicates the number of nodes to deploy for multinode components.
// Total number of GPUs is NumberOfNodes * GPU limit.
NumberOfNodes *int32 `json:"numberOfNodes,omitempty"`
}

type IngressTLSSpec struct {
Expand Down Expand Up @@ -234,3 +238,18 @@ func (s *DynamoComponentDeployment) SetDynamoDeploymentConfig(config []byte) {
Value: string(config),
})
}

// IsMultinode reports whether this component deployment spans more than one
// node, i.e. whether GetNumberOfNodes returns a value greater than 1.
func (s *DynamoComponentDeployment) IsMultinode() bool {
	nodeCount := s.GetNumberOfNodes()
	return nodeCount > 1
}

// GetNumberOfNodes returns the node count for this component deployment by
// delegating to the GetNumberOfNodes method of its Spec.
func (s *DynamoComponentDeployment) GetNumberOfNodes() int32 {
	nodeCount := s.Spec.GetNumberOfNodes()
	return nodeCount
}

// GetNumberOfNodes returns the value of NumberOfNodes when it is set, and
// falls back to 1 (a single-node deployment) when the field is nil.
func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 {
	// Unset field: treat the component as single-node.
	if s.NumberOfNodes == nil {
		return 1
	}
	return *s.NumberOfNodes
}
5 changes: 5 additions & 0 deletions deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -10009,6 +10009,13 @@ spec:
format: int32
type: integer
type: object
numberOfNodes:
default: 1
description: |-
Indicates the number of nodes to deploy for multinode components.
Total number of GPUs is NumberOfNodes * GPU limit.
format: int32
type: integer
pvc:
description: PVC config describing volumes to be mounted by the component.
properties:
Expand Down Expand Up @@ -10199,13 +10206,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
requests:
properties:
Expand All @@ -10216,13 +10222,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
type: object
serviceName:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10108,6 +10108,13 @@ spec:
format: int32
type: integer
type: object
numberOfNodes:
default: 1
description: |-
Indicates the number of nodes to deploy for multinode components.
Total number of GPUs is NumberOfNodes * GPU limit.
format: int32
type: integer
pvc:
description: PVC config describing volumes to be mounted by the component.
properties:
Expand Down Expand Up @@ -10298,13 +10305,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
requests:
properties:
Expand All @@ -10315,13 +10321,12 @@ spec:
type: string
type: object
gpu:
description: GPU is the number of GPUs to request per node.
description: |-
Indicates the number of GPUs to request.
total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
type: string
memory:
type: string
nodes:
description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
type: string
type: object
type: object
serviceName:
Expand Down
Loading
Loading