ai-dynamo · tedzhouhk · Jul 28, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 12, 2025
@@ -0,0 +1 @@
+../../docs/architecture/pre_deployment_profiling.md
@@ -589,9 +589,9 @@ async def run_profile(args):
     parser.add_argument(
         "--backend",
         type=str,
-        default="vllm_v1",
-        choices=["vllm_v1"],
-        help="backend type, currently support [vllm_v1]",
+        default="vllm",
+        choices=["vllm"],
+        help="backend type, currently support [vllm]",
     )
     parser.add_argument(
         "--config",

@@ -80,7 +80,7 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         config = deepcopy(config)
 
         # set metadata name
-        config["metadata"]["name"] = "vllm-v1-agg"
+        config["metadata"]["name"] = "vllm-agg"
 
         # disable planner
         if "Planner" in config["spec"]["services"]:
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
 
             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ]["extraPodSpec"]["mainContainer"]["args"]
 
             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")
 
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
                 "extraPodSpec"
             ]["mainContainer"]["args"] = join_arguments(args)
 
         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
 
             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ]["extraPodSpec"]["mainContainer"]["args"]
 
             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")
 
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
                 "extraPodSpec"
             ]["mainContainer"]["args"] = join_arguments(args)
 
         # set num workers to 1
         decode_worker_config = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker
         ]
         decode_worker_config["replicas"] = 1
 
@@ -150,16 +150,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
     def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)
 
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
             "resources"
         ]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
             "resources"
         ]["limits"]["gpu"] = str(tp_size)
 
-        args = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-        ]["extraPodSpec"]["mainContainer"]["args"]
+        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+            "extraPodSpec"
+        ]["mainContainer"]["args"]
 
         args = break_arguments(args)
 
@@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         except ValueError:
             args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
 
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
             "extraPodSpec"
         ]["mainContainer"]["args"] = join_arguments(args)
 
         return config
 
     @classmethod
     def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
         args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
             "args"
         ]
@@ -232,5 +232,5 @@ def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
 
 
 CONFIG_MODIFIERS = {
-    "vllm_v1": VllmV1ConfigModifier,
+    "vllm": VllmV1ConfigModifier,
 }
@@ -17,9 +17,9 @@
 import asyncio
 import time
 from pathlib import Path
-from typing import Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
-import aiofiles
+import aiofiles  # type: ignore[import-untyped]
 import httpx  # added for HTTP requests
 import kubernetes_asyncio as kubernetes
 import yaml
@@ -62,9 +62,9 @@ def __init__(
         self.deployment_name = deployment_name
         self.model_name = model_name
         self.service_name = service_name or f"{deployment_name}-frontend"
-        self.components: list[str] = []  # Will store component names from CR
+        self.components: List[str] = []  # Will store component names from CR
         self.deployment_spec: Optional[
-            dict
+            Dict[str, Any]
         ] = None  # Will store the full deployment spec
         self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
         self.frontend_port = frontend_port

diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md
@@ -112,6 +112,7 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
 - `agg_router.yaml` - Aggregated serving with KV routing
 - `disagg.yaml` - Disaggregated serving
 - `disagg_router.yaml` - Disaggregated serving with KV routing
+- `disagg_planner.yaml` - Disaggregated serving with the [SLA Planner](../../../docs/architecture/sla_planner.md). See the [SLA Planner Deployment Guide](../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) for more details.
 
 #### Prerequisites
 
@@ -124,6 +125,8 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
   # Update the image references in the YAML files
   ```
 
+- **Pre-Deployment Profiling (if using the SLA Planner)**: Follow the [pre-deployment profiling guide](../../../docs/architecture/pre_deployment_profiling.md) before deploying. The profiling results are saved to the `profiling-pvc` PVC and queried by the SLA Planner.
+
 - **Port Forwarding**: After deployment, forward the frontend service to access the API:
   ```bash
   kubectl port-forward deployment/vllm-v1-disagg-frontend-<pod-uuid-info> 8080:8000

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -6,6 +6,13 @@ kind: DynamoGraphDeployment
 metadata:
   name: vllm-disagg-planner
 spec:
+  envs:
+    - name: DYNAMO_SERVICE_CONFIG
+      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
+    - name: DYNAMO_PORT
+      value: "8000"
+    - name: DYNAMO_NAMESPACE
+      value: "vllm-disagg-planner"
   services:
     Frontend:
       dynamoNamespace: vllm-disagg-planner
@@ -31,25 +38,114 @@ spec:
         failureThreshold: 10
       resources:
         requests:
-          cpu: "1"
+          cpu: "32"
+          memory: "10Gi"
+        limits:
+          cpu: "32"
+          memory: "10Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          workingDir: /workspace/components/backends/vllm
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - "python3 -m dynamo.frontend --http-port 8000"
+    Planner:
+      dynamoNamespace: vllm-disagg-planner
+      envFromSecret: hf-token-secret
+      componentType: planner
+      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "2"
           memory: "2Gi"
         limits:
-          cpu: "1"
+          cpu: "2"
           memory: "2Gi"
+      pvc:
+        create: false
+        name: profiling-pvc # Must be created before deployment, and the SLA profiler must already have been run
+        mountPoint: /workspace/profiling_results
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          workingDir: /workspace/components/planner/src/dynamo/planner
+          args:
+            - python
+            - -m
+            - planner_sla
+            - --environment=kubernetes
+            - --backend=vllm
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/profiling_results
+    Prometheus:
+      dynamoNamespace: vllm-disagg-planner
+      componentType: main
+      replicas: 1
+      envs:
+        - name: PYTHONPATH
+          value: "/workspace/components/planner/src"
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 30
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.frontend --http-port 8000"
-    VllmDecodeWorker:
+            - "python3 -m dynamo.planner.prometheus"
+    backend:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
-      replicas: 1
+      replicas: 2
       livenessProbe:
         httpGet:
           path: /live
@@ -66,12 +162,12 @@ spec:
         failureThreshold: 60
       resources:
         requests:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
         limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
       envs:
         - name: DYN_SYSTEM_ENABLED
@@ -88,18 +184,18 @@ spec:
               port: 9090
             periodSeconds: 10
             failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B  2>&1 | tee /tmp/vllm.log"
-    VllmPrefillWorker:
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
+    prefill:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
-      replicas: 1
+      replicas: 2
       livenessProbe:
         httpGet:
           path: /health
@@ -116,12 +212,12 @@ spec:
         failureThreshold: 60
       resources:
         requests:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
         limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
       envs:
         - name: DYN_SYSTEM_ENABLED
@@ -138,10 +234,10 @@ spec:
               port: 9090
             periodSeconds: 10
             failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B  --is-prefill-worker 2>&1 | tee /tmp/vllm.log
+            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../docs/architecture/pre_deployment_profiling.md