ai-dynamo · tedzhouhk · Aug 4, 2025 · Aug 4, 2025 · Aug 4, 2025 · Aug 4, 2025
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -88,14 +88,14 @@ docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.3.2
 ### Aggregated Serving
 
 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/agg.sh
 ```
 
 ### Aggregated Serving with KV Routing
 
 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/agg_router.sh
 ```
 
@@ -119,7 +119,7 @@ Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead
 > Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA based model](https://github.com/sgl-project/sglang/pull/5922)
 
 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/disagg.sh
 ```
 
@@ -129,12 +129,32 @@ You can use this configuration to test out disaggregated serving with dp attenti
 
 ```bash
 # note this will require 4 GPUs
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/disagg_dp_attn.sh
 ```
 
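-When using MoE models, you can also use the our implementation of the native SGLang endpoints to record expert distribution data. The `disagg_dp_attn.sh` script automatically sets up the SGLang HTTP server, the environment variable that controls the expert distribution recording directory, and sets up the expert distribution recording mode to `stat`. You can learn more about expert parallelism load balancing [here](docs/expert-distribution-eplb.md).
+When using MoE models, you can also use our implementation of the native SGLang endpoints to record expert distribution data. The `disagg_dp_attn.sh` script automatically sets up the SGLang HTTP server, sets the environment variable that controls the expert distribution recording directory, and sets the expert distribution recording mode to `stat`. You can learn more about expert parallelism load balancing [here](docs/expert-distribution-eplb.md).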
 
+### Testing the Deployment
+
+Send a test request to verify your deployment:
+
+```bash
+curl localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "messages": [
+    {
+        "role": "user",
+        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at the city's location. Legend suggests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost family, or something else? Describe where the first clue is hidden."
+    }
+    ],
+    "stream": false,
+    "max_tokens": 30
+  }'
+```
+
 ## Request Migration
 
 You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:

diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml
@@ -21,7 +21,7 @@ spec:
           command:
             - /bin/sh
             - -c
-            - "exit 0"
+            - "curl -s http://localhost:8000/health | jq -e '.status == \"healthy\"'"
         initialDelaySeconds: 60
         periodSeconds: 60
         timeoutSeconds: 30
@@ -31,54 +31,63 @@ spec:
       replicas: 1
       resources:
         requests:
-          cpu: "5"
+          cpu: "10"
           memory: "10Gi"
         limits:
-          cpu: "5"
-          memory: "10Gi"
+          cpu: "32"
+          memory: "40Gi"
       extraPodSpec:
         mainContainer:
           image: my-registry/sglang-runtime:my-tag
           workingDir: /workspace/components/backends/sglang
           command: ["sh", "-c"]
           args:
             - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
     SGLangDecodeWorker:
       envFromSecret: hf-token-secret
       livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
         timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
       readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        # Probe the system status port (DYN_SYSTEM_PORT below); 8000 is the Frontend's --http-port.
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
         timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
       dynamoNamespace: sglang-disagg
       componentType: worker
       replicas: 1
       resources:
         requests:
           cpu: "10"
-          memory: "20Gi"
+          memory: "40Gi"
           gpu: "1"
         limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
           gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
       extraPodSpec:
         mainContainer:
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
           image: my-registry/sglang-runtime:my-tag
           workingDir: /workspace/components/backends/sglang
           args:
             - "python3"
@@ -100,40 +109,49 @@ spec:
             - "nixl"
     SGLangPrefillWorker:
       envFromSecret: hf-token-secret
       livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
         timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
       readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        # Probe the system status port (DYN_SYSTEM_PORT below); 8000 is the Frontend's --http-port.
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
         timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
       dynamoNamespace: sglang-disagg
       componentType: worker
       replicas: 1
       resources:
         requests:
           cpu: "10"
-          memory: "20Gi"
+          memory: "40Gi"
           gpu: "1"
         limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
           gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
       extraPodSpec:
         mainContainer:
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
           image: my-registry/sglang-runtime:my-tag
           workingDir: /workspace/components/backends/sglang
           args:
             - "python3"