48 changes: 48 additions & 0 deletions .cursor/rules/docs-rules.mdc
@@ -0,0 +1,48 @@
---
globs: *.md
alwaysApply: false
---
Spell out numbers under 10 unless you are talking about a specific value or a parameter.
Correct: This uses four nodes.
Incorrect: This uses 4 nodes.
Avoid using "please" and "see."
"Please" is used in marketing documents rather than technical documentation.
"Refer to" is more considerate of users with visual disabilities than "see."
Write "refer to the following" rather than "see the following."
Use "can" instead of "may" whenever possible.
Use: The installation can take up to 20 minutes.
Avoid: The installation may take up to 20 minutes.
When using quotation marks, place end punctuation in the correct location: in U.S. style, periods and commas go inside the closing quotation mark.
Correct: A key and familiar paradigm in Biology exists that "Structure determines Function."
Incorrect: A key and familiar paradigm in Biology exists that "Structure determines Function". (I know this looks odd to most people educated outside of the U.S.)
Avoid Latin terms and abbreviations such as e.g., etc., i.e., and via.
Spell them out instead: e.g. = for example, etc. = and so on, i.e. = that is, and via = using, by, or through.
Avoid excessive hyphenation.
For example, "command line" used as a noun doesn't need a hyphen, but it does when used as an adjective.
Example: The command line is very easy to use, but command-line output can be hard to understand.
Ensure commas are used correctly.
Use a comma before the conjunction in a list of three or more items. (The comma that comes before the conjunction is known as the Oxford or serial comma.)
Google includes Mail, Calendar, People, and Tasks.
Save your file to a hard drive, an external drive, or OneDrive.
Note: If a series contains more than three items or the items are long, consider a bulleted list to improve readability.
Commas also follow an introductory phrase.
With WhatsApp, you can call any phone.
Use a comma to join independent clauses with a conjunction, such as and, or, but, yet, or so.
This tool is used to parse incoming parameters, and it can also be used to parse the outgoing results.
Note: If the sentence is long or complex, consider rewriting it as two sentences.
Case headings consistently throughout the document (we currently use title case).
Write "NVIDIA", not "Nvidia", when referring to the company or its products. The "Nvidia" casing appears in marketing docs and blogs, but it shouldn't be used in our tech docs.
Use "after" instead of "once" when describing sequence; the two have a subtle difference.
"Once" expresses a sense of urgency, while "after" simply points out what follows an action or event.
Example:
"Call me once you get home" means as soon as you step in the door.
"Call me after you get home" means call me sometime after you arrive.
"Once" can also indicate that something is done one time only.
Make sure you use "which" and "that" (relative pronouns) correctly.
Relative pronouns such as "which" and "that" begin adjective clauses. Often, these clauses can be removed from a sentence without changing its meaning, and omitting them sometimes results in cleaner, tighter sentences. When a clause is required for clarity, it is important to know which pronoun to use.
Clauses can be essential because they provide the reader with more information about the noun they follow. Clauses that start with "which" are nonrestrictive (nonessential): they do not limit the essential meaning of the element they modify, and removing them leaves the implied meaning of the sentence the same. Nonrestrictive clauses should always be set off from the rest of the sentence with commas. Clauses that start with "that" are restrictive (essential): they limit the meaning of the noun they modify and are not set off with commas.
Example: The script, which takes about a minute to run, prints a summary. / Use the script that prints a summary.
2 changes: 1 addition & 1 deletion components/README.md
@@ -29,7 +29,7 @@ Dynamo supports multiple inference engines (with a focus on SGLang, vLLM, and Te

Each engine provides launch scripts for different deployment patterns in its `/launch` and `/deploy` directories.

## Core Components
## Core Services

### [Backends](backends/)

2 changes: 1 addition & 1 deletion components/backends/sglang/README.md
@@ -50,7 +50,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
| **GB200 Support** | ✅ | |


## Quick Start
## SGLang Quick Start

Below we provide a guide that lets you run all of the common deployment patterns on a single node. Refer to the different [architectures](../llm/README.md#deployment-architectures) for a high-level overview of each pattern and its architecture diagram.
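
As a minimal illustration of the aggregated pattern, the sketch below runs, directly on a single node, the same two commands that the deployment manifests in this PR pass to their containers. It assumes the dynamo Python packages are installed locally and that the runtime prerequisites (such as etcd and NATS) are already reachable; those prerequisites are not shown in this diff.

```bash
# Sketch only: commands mirror the args in deploy/agg.yaml.bak.
# Assumes dynamo is installed locally and etcd/NATS are running.

# HTTP frontend on port 8000
python3 -m dynamo.frontend --http-port=8000 &

# Single aggregated SGLang worker
python3 -m dynamo.sglang.worker \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
  --skip-tokenizer-init
```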

62 changes: 62 additions & 0 deletions components/backends/sglang/deploy/agg.yaml.bak
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-agg
spec:
  services:
    Frontend:
      dynamoNamespace: sglang-agg
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "40Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000"
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "32"
          memory: "80Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
62 changes: 62 additions & 0 deletions components/backends/sglang/deploy/agg_router.yaml.bak
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-agg-router
spec:
  services:
    Frontend:
      dynamoNamespace: sglang-agg-router
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "40Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv"
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "32"
          memory: "80Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
105 changes: 105 additions & 0 deletions components/backends/sglang/deploy/disagg.yaml.bak
@@ -0,0 +1,105 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-disagg
spec:
  services:
    Frontend:
      dynamoNamespace: sglang-disagg
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "10Gi"
        limits:
          cpu: "32"
          memory: "40Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "32"
          memory: "80Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.decode_worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
            - "--disaggregation-mode"
            - "decode"
            - "--disaggregation-transfer-backend"
            - "nixl"
    SGLangPrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "32"
          memory: "80Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.4.0
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
            - "--disaggregation-mode"
            - "prefill"
            - "--disaggregation-transfer-backend"
            - "nixl"