ai-dynamo · julienmancuso · Aug 15, 2025 · Aug 11, 2025 · Aug 15, 2025 · Aug 15, 2025
diff --git a/components/backends/sglang/deploy/disagg-multinode.yaml b/components/backends/sglang/deploy/disagg-multinode.yaml
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-disagg-multinode
+spec:
+  envs:  # listed at spec level, above the per-service definitions
+  - name: HF_TOKEN  # value is read from key HF_TOKEN of the Secret hf-token-secret
+    valueFrom:
+      secretKeyRef:
+        name: hf-token-secret
+        key: HF_TOKEN
+  - name: GLOO_SOCKET_IFNAME  # NOTE(review): assumes the pod network interface is named eth0 — confirm for your cluster
+    value: "eth0"
+  backendFramework: sglang
+  services:
+    Frontend:  # componentType main, one replica; no resources block is set for it
+      dynamoNamespace: sglang-disagg-multinode  # same namespace value as the decode service and the clear_namespace call below
+      componentType: main
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag  # placeholder image reference; replace with your registry and tag
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]
+          args:  # one shell string: runs dynamo.sglang.utils.clear_namespace for this namespace, then starts dynamo.frontend on HTTP port 8000
+            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg-multinode && python3 -m dynamo.frontend --http-port=8000"
+    decode:  # worker started with --disaggregation-mode decode
+      numberOfNodes: 2  # 2 nodes x 4 GPUs (limits.gpu) = 8 GPUs, matching --tp-size 8 below
+      envFromSecret: hf-token-secret
+      dynamoNamespace: sglang-disagg-multinode  # same namespace value as the Frontend service
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "40Gi"
+        limits:
+          cpu: "10"
+          memory: "40Gi"
+          gpu: "4"  # multiplied by numberOfNodes to get the total GPU count
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag  # placeholder image reference; replace with your registry and tag
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]  # NOTE(review): sh -c treats only the first arg as the script; presumably the operator joins args into one command for multinode workers — confirm
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.decode_worker"
+            - "--model-path"
+            - "meta-llama/Llama-3.3-70B-Instruct"
+            - "--served-model-name"
+            - "meta-llama/Llama-3.3-70B-Instruct"
+            - "--tp-size"
+            - "8"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "decode"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
+            - "--disaggregation-bootstrap-port"
+            - "30001"  # same bootstrap port value as the prefill service
+            - "--mem-fraction-static"
+            - "0.82"
+    prefill:  # worker started with --disaggregation-mode prefill
+      numberOfNodes: 2  # 2 nodes x 4 GPUs (limits.gpu) = 8 GPUs, matching --tp-size 8 below
+      envFromSecret: hf-token-secret
+      dynamoNamespace: sglang-disagg-multinode  # must be the same namespace as the Frontend and decode services
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "40Gi"
+        limits:
+          cpu: "10"
+          memory: "40Gi"
+          gpu: "4"  # multiplied by numberOfNodes to get the total GPU count
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag  # placeholder image reference; replace with your registry and tag
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]  # NOTE(review): sh -c treats only the first arg as the script; presumably the operator joins args into one command for multinode workers — confirm
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.worker"
+            - "--model-path"
+            - "meta-llama/Llama-3.3-70B-Instruct"
+            - "--served-model-name"
+            - "meta-llama/Llama-3.3-70B-Instruct"
+            - "--tp-size"
+            - "8"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "prefill"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
+            - "--disaggregation-bootstrap-port"
+            - "30001"  # same bootstrap port value as the decode service
+            - "--mem-fraction-static"
+            - "0.82"
@@ -10009,6 +10009,13 @@ spec:
                       format: int32
                       type: integer
                   type: object
+                numberOfNodes:
+                  default: 1
+                  description: |-
+                    Indicates the number of nodes to deploy for multinode components.
+                    Total number of GPUs is NumberOfNodes * GPU limit.
+                  format: int32
+                  type: integer
                 pvc:
                   description: PVC config describing volumes to be mounted by the component.
                   properties:
@@ -10199,13 +10206,12 @@ spec:
                             type: string
                           type: object
                         gpu:
-                          description: GPU is the number of GPUs to request per node.
+                          description: |-
+                            Indicates the number of GPUs to request.
+                            total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                           type: string
                         memory:
                           type: string
-                        nodes:
-                          description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                          type: string
                       type: object
                     requests:
                       properties:
@@ -10216,13 +10222,12 @@ spec:
                             type: string
                           type: object
                         gpu:
-                          description: GPU is the number of GPUs to request per node.
+                          description: |-
+                            Indicates the number of GPUs to request.
+                            total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                           type: string
                         memory:
                           type: string
-                        nodes:
-                          description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                          type: string
                       type: object
                   type: object
                 serviceName:

@@ -10108,6 +10108,13 @@ spec:
                             format: int32
                             type: integer
                         type: object
+                      numberOfNodes:
+                        default: 1
+                        description: |-
+                          Indicates the number of nodes to deploy for multinode components.
+                          Total number of GPUs is NumberOfNodes * GPU limit.
+                        format: int32
+                        type: integer
                       pvc:
                         description: PVC config describing volumes to be mounted by the component.
                         properties:
@@ -10298,13 +10305,12 @@ spec:
                                   type: string
                                 type: object
                               gpu:
-                                description: GPU is the number of GPUs to request per node.
+                                description: |-
+                                  Indicates the number of GPUs to request.
+                                  total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                                 type: string
                               memory:
                                 type: string
-                              nodes:
-                                description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                                type: string
                             type: object
                           requests:
                             properties:
@@ -10315,13 +10321,12 @@ spec:
                                   type: string
                                 type: object
                               gpu:
-                                description: GPU is the number of GPUs to request per node.
+                                description: |-
+                                  Indicates the number of GPUs to request.
+                                  total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                                 type: string
                               memory:
                                 type: string
-                              nodes:
-                                description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                                type: string
                             type: object
                         type: object
                       serviceName:

@@ -97,7 +97,9 @@ manifests: controller-gen ensure-yq ## Generate WebhookConfiguration, ClusterRol
 			yq eval '.metadata.annotations."helm.sh/resource-policy" = "keep"' -i "$$file"; \
 		fi; \
 	done
-	cp config/crd/bases/*.yaml ../helm/crds/templates/
+	if [ -d "../helm/crds/templates/" ]; then \
+		cp config/crd/bases/*.yaml ../helm/crds/templates/; \
+	fi
 
 .PHONY: generate
 generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.

@@ -25,10 +25,9 @@ import (
 type ResourceItem struct {
 	CPU    string `json:"cpu,omitempty"`
 	Memory string `json:"memory,omitempty"`
-	// GPU is the number of GPUs to request per node.
-	GPU string `json:"gpu,omitempty"`
-	// Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-	Nodes  string            `json:"nodes,omitempty"`
+	// Indicates the number of GPUs to request.
+	// total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
+	GPU    string            `json:"gpu,omitempty"`
 	Custom map[string]string `json:"custom,omitempty"`
 }
 

@@ -106,6 +106,10 @@ type DynamoComponentDeploymentSharedSpec struct {
 	ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"`
 	// Replicas is the desired number of Pods for this component when autoscaling is not used.
 	Replicas *int32 `json:"replicas,omitempty"`
+	// +kubebuilder:default=1
+	// Indicates the number of nodes to deploy for multinode components.
+	// Total number of GPUs is NumberOfNodes * GPU limit.
+	NumberOfNodes *int32 `json:"numberOfNodes,omitempty"`
 }
 
 type IngressTLSSpec struct {
@@ -234,3 +238,23 @@ func (s *DynamoComponentDeployment) SetDynamoDeploymentConfig(config []byte) {
 		Value: string(config),
 	})
 }
+
+// IsMultinode reports whether this component is configured to span more than one node.
+func (s *DynamoComponentDeployment) IsMultinode() bool {
+	return s.GetNumberOfNodes() > 1
+}
+
+// GetNumberOfNodes returns the number of nodes for this component by delegating to its Spec.
+func (s *DynamoComponentDeployment) GetNumberOfNodes() int32 {
+	return s.Spec.GetNumberOfNodes()
+}
+
+// GetNumberOfNodes returns the configured NumberOfNodes, falling back to 1
+// (single node) when the field is unset or holds a non-positive value.
+func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 {
+	// A node count below 1 is meaningless, so it is treated the same as "unset".
+	if s.NumberOfNodes != nil && *s.NumberOfNodes > 0 {
+		return *s.NumberOfNodes
+	}
+	return 1
+}
@@ -10009,6 +10009,13 @@ spec:
                       format: int32
                       type: integer
                   type: object
+                numberOfNodes:
+                  default: 1
+                  description: |-
+                    Indicates the number of nodes to deploy for multinode components.
+                    Total number of GPUs is NumberOfNodes * GPU limit.
+                  format: int32
+                  type: integer
                 pvc:
                   description: PVC config describing volumes to be mounted by the component.
                   properties:
@@ -10199,13 +10206,12 @@ spec:
                             type: string
                           type: object
                         gpu:
-                          description: GPU is the number of GPUs to request per node.
+                          description: |-
+                            Indicates the number of GPUs to request.
+                            total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                           type: string
                         memory:
                           type: string
-                        nodes:
-                          description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                          type: string
                       type: object
                     requests:
                       properties:
@@ -10216,13 +10222,12 @@ spec:
                             type: string
                           type: object
                         gpu:
-                          description: GPU is the number of GPUs to request per node.
+                          description: |-
+                            Indicates the number of GPUs to request.
+                            total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                           type: string
                         memory:
                           type: string
-                        nodes:
-                          description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                          type: string
                       type: object
                   type: object
                 serviceName:

@@ -10108,6 +10108,13 @@ spec:
                             format: int32
                             type: integer
                         type: object
+                      numberOfNodes:
+                        default: 1
+                        description: |-
+                          Indicates the number of nodes to deploy for multinode components.
+                          Total number of GPUs is NumberOfNodes * GPU limit.
+                        format: int32
+                        type: integer
                       pvc:
                         description: PVC config describing volumes to be mounted by the component.
                         properties:
@@ -10298,13 +10305,12 @@ spec:
                                   type: string
                                 type: object
                               gpu:
-                                description: GPU is the number of GPUs to request per node.
+                                description: |-
+                                  Indicates the number of GPUs to request.
+                                  total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                                 type: string
                               memory:
                                 type: string
-                              nodes:
-                                description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                                type: string
                             type: object
                           requests:
                             properties:
@@ -10315,13 +10321,12 @@ spec:
                                   type: string
                                 type: object
                               gpu:
-                                description: GPU is the number of GPUs to request per node.
+                                description: |-
+                                  Indicates the number of GPUs to request.
+                                  total number of GPUs is NumberOfNodes * GPU in case of multinode deployment.
                                 type: string
                               memory:
                                 type: string
-                              nodes:
-                                description: Nodes is the number of nodes to request. Total number of GPUs will be GPU * Nodes.
-                                type: string
                             type: object
                         type: object
                       serviceName: