feat: use consistent small models across all deploy examples (#2573)

biswapanda · web-flow · commit 4eb25632227f · 2025-08-20T18:39:39.000-07:00
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -193,7 +193,7 @@ Send a test request to verify your deployment:
 curl localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "model": "Qwen/Qwen3-0.6B",
     "messages": [
     {
         "role": "user",
diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml
@@ -32,8 +32,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml
@@ -35,8 +35,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
diff --git a/components/backends/sglang/deploy/disagg-multinode.yaml b/components/backends/sglang/deploy/disagg-multinode.yaml
@@ -68,8 +68,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path meta-llama/Llama-3.3-70B-Instruct
-              --served-model-name meta-llama/Llama-3.3-70B-Instruct
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --tp-size 8
               --trust-remote-code
               --skip-tokenizer-init
diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml
@@ -32,8 +32,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
@@ -59,8 +59,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
diff --git a/components/backends/sglang/deploy/disagg_planner.yaml b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -116,8 +116,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
@@ -142,8 +142,8 @@ spec:
           args:
             - >-
               python3 -m dynamo.sglang
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --page-size 16
               --tp 1
               --trust-remote-code
diff --git a/components/backends/sglang/docs/sgl-hicache-example.md b/components/backends/sglang/docs/sgl-hicache-example.md
@@ -11,7 +11,7 @@ This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dyna
 
 ```bash
 python -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
   --host 0.0.0.0 --port 8000 \
   --page-size 64 \
   --enable-hierarchical-cache \
@@ -39,7 +39,7 @@ python -m dynamo.frontend --http-port 8000
 curl localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "model": "Qwen/Qwen3-0.6B",
     "messages": [
       {
         "role": "user",
@@ -56,7 +56,7 @@ curl localhost:8000/v1/chat/completions \
 Run the perf script:
 ```bash
 bash -x /workspace/benchmarks/llm/perf.sh \
-  --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model Qwen/Qwen3-0.6B \
   --tensor-parallelism 1 \
   --data-parallelism 1 \
   --concurrency "2,4,8" \
diff --git a/components/backends/sglang/launch/agg.sh b/components/backends/sglang/launch/agg.sh
@@ -20,8 +20,8 @@ DYNAMO_PID=$!
 
 # run worker
 python3 -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --page-size 16 \
   --tp 1 \
   --trust-remote-code \
diff --git a/components/backends/sglang/launch/agg_router.sh b/components/backends/sglang/launch/agg_router.sh
@@ -20,8 +20,8 @@ DYNAMO_PID=$!
 
 # run worker
 python3 -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --page-size 16 \
   --tp 1 \
   --trust-remote-code \
@@ -30,8 +30,8 @@ python3 -m dynamo.sglang \
 WORKER_PID=$!
 
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --page-size 16 \
   --tp 1 \
   --trust-remote-code \
diff --git a/components/backends/sglang/launch/disagg.sh b/components/backends/sglang/launch/disagg.sh
@@ -20,8 +20,8 @@ DYNAMO_PID=$!
 
 # run prefill worker
 python3 -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --page-size 16 \
   --tp 1 \
   --trust-remote-code \
@@ -32,8 +32,8 @@ PREFILL_PID=$!
 
 # run decode worker
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --page-size 16 \
   --tp 1 \
   --trust-remote-code \
diff --git a/components/backends/trtllm/deploy/agg.yaml b/components/backends/trtllm/deploy/agg.yaml
@@ -32,6 +32,6 @@ spec:
           args:
             - >-
               python3 -m dynamo.trtllm
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --extra-engine-args engine_configs/agg.yaml
diff --git a/components/backends/trtllm/deploy/agg_router.yaml b/components/backends/trtllm/deploy/agg_router.yaml
@@ -35,7 +35,7 @@ spec:
           args:
             - >-
               python3 -m dynamo.trtllm
-              --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-              --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+              --model-path Qwen/Qwen3-0.6B
+              --served-model-name Qwen/Qwen3-0.6B
               --extra-engine-args engine_configs/agg.yaml
               --publish-events-and-metrics
diff --git a/components/backends/trtllm/deploy/disagg.yaml b/components/backends/trtllm/deploy/disagg.yaml
@@ -30,7 +30,7 @@ spec:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
+            - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
     TRTLLMDecodeWorker:
       dynamoNamespace: trtllm-disagg
       envFromSecret: hf-token-secret
@@ -47,4 +47,4 @@ spec:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
+            - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
diff --git a/components/backends/trtllm/deploy/disagg_router.yaml b/components/backends/trtllm/deploy/disagg_router.yaml
@@ -33,7 +33,7 @@ spec:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
+            - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
     TRTLLMDecodeWorker:
       dynamoNamespace: trtllm-v1-disagg-router
       envFromSecret: hf-token-secret
@@ -50,4 +50,4 @@ spec:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
+            - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
diff --git a/components/backends/trtllm/launch/agg.sh b/components/backends/trtllm/launch/agg.sh
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
-export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
 export MODALITY=${MODALITY:-"text"}
 # If you want to use multimodal, set MODALITY to "multimodal"
diff --git a/components/backends/trtllm/launch/agg_router.sh b/components/backends/trtllm/launch/agg_router.sh
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
-export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
 
 # Setup cleanup trap
diff --git a/components/backends/trtllm/launch/disagg.sh b/components/backends/trtllm/launch/disagg.sh
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
-export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
 export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
 export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
diff --git a/components/backends/trtllm/launch/disagg_router.sh b/components/backends/trtllm/launch/disagg_router.sh
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
-export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
 export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
 export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}

Original file line number	Diff line number	Diff line change
`@@ -193,7 +193,7 @@ Send a test request to verify your deployment:`
`193`	`193`	`curl localhost:8000/v1/chat/completions \`
`194`	`194`	`-H "Content-Type: application/json" \`
`195`	`195`	`-d '{`
`196`		`- "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",`
	`196`	`+ "model": "Qwen/Qwen3-0.6B",`
`197`	`197`	`"messages": [`
`198`	`198`	`{`
`199`	`199`	`"role": "user",`