Skip to content
Merged
Show file tree
Hide file tree
Changes from 75 commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
065cb2a
feat: update k8s deploy yamls to use binary/python3
hhzhang16 Jul 11, 2025
aee478c
config part working
tedzhouhk Jul 11, 2025
9455ad1
feat: add component type worker and bump image
hhzhang16 Jul 12, 2025
f3dd01a
fix: merge conflicts
mohammedabdulwahhab Jul 14, 2025
7de97ef
fix: using health checks exposed by dynamo-run
mohammedabdulwahhab Jul 14, 2025
16fd7f2
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 14, 2025
3a29913
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
hhzhang16 Jul 14, 2025
51835db
fix: check for message in logs
mohammedabdulwahhab Jul 14, 2025
39b377f
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
mohammedabdulwahhab Jul 14, 2025
dddb45f
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
tedzhouhk Jul 14, 2025
34bc79c
define apis
tedzhouhk Jul 14, 2025
8c22d14
update script
tedzhouhk Jul 14, 2025
9856dde
fix: add dynamodeployment lib
mohammedabdulwahhab Jul 14, 2025
61a215b
fix: working client lib
mohammedabdulwahhab Jul 14, 2025
5141334
fix: working client lib
mohammedabdulwahhab Jul 14, 2025
8e25a29
integrate with utils.dynamo_deployment
tedzhouhk Jul 15, 2025
1d87164
fix: port forward works
mohammedabdulwahhab Jul 15, 2025
aaf4544
Merge branch 'hzhou/profile_vllmv1_k8s' of https://github.com/ai-dyna…
mohammedabdulwahhab Jul 15, 2025
65dec07
pc
tedzhouhk Jul 15, 2025
0af209b
add dep; bug fix
tedzhouhk Jul 15, 2025
918733a
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 15, 2025
3f900ef
staging, port forward not working
tedzhouhk Jul 15, 2025
bd12d40
stage
tedzhouhk Jul 15, 2025
7ac43a9
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
mohammedabdulwahhab Jul 15, 2025
9971acf
fix: running script
mohammedabdulwahhab Jul 16, 2025
a5d8aca
fix: fix
mohammedabdulwahhab Jul 16, 2025
7b1d99a
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 16, 2025
f8f9363
add logic to find a free port
tedzhouhk Jul 16, 2025
8e292f6
feat: add Kubernetes service account configuration for SLA profiling …
hhzhang16 Jul 17, 2025
d62731f
feat: use service DNS for interfacing with deployments when profiling…
hhzhang16 Jul 17, 2025
a1aea5a
Revert "feat: use service DNS for interfacing with deployments when p…
hhzhang16 Jul 17, 2025
06bfe3b
feat: use service DNS instead of port forwarding for K8s-deployed SLA…
hhzhang16 Jul 18, 2025
ff96b9e
add try-catch waiting for deployment
tedzhouhk Jul 18, 2025
5419885
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 21, 2025
d2b6b00
feat: clean up outlying DGDs upon SLA profiling failure (#2016)
hhzhang16 Jul 21, 2025
450d371
add debug info
tedzhouhk Jul 22, 2025
d8ffe1a
Merge branch 'hzhou/profile_vllmv1_k8s' of https://github.com/ai-dyna…
tedzhouhk Jul 22, 2025
769c98e
sla planner
tedzhouhk Jul 22, 2025
e726d43
add choices
tedzhouhk Jul 22, 2025
3663c5c
Merge branch 'main' of github.com:ai-dynamo/dynamo into hzhou/sla-pla…
hhzhang16 Jul 22, 2025
ff6c491
feat: vllm_v1 -> vllm and remove vllm_v0 from planner
hhzhang16 Jul 22, 2025
6ebfe73
feat: remove local connector from init
hhzhang16 Jul 22, 2025
fb89fc2
feat: remove LocalConnector from core
hhzhang16 Jul 22, 2025
047cecb
feat: rework prometheus file for planner deployment
hhzhang16 Jul 22, 2025
894f2e7
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 23, 2025
cd268ca
Merge branch 'main' of github.com:ai-dynamo/dynamo into hzhou/sla-pla…
hhzhang16 Jul 23, 2025
0f5082c
deprecate old docs
tedzhouhk Jul 23, 2025
9751e65
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 23, 2025
c33713f
Merge branch 'hzhou/sla-planner-ux-refac' of github.com:ai-dynamo/dyn…
hhzhang16 Jul 24, 2025
33371db
feat: update prometheus to work
hhzhang16 Jul 24, 2025
60dd89d
feat: k8s connector scaling P/D in one call (#2103)
tedzhouhk Jul 25, 2025
61a5e9a
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 25, 2025
41f1ca0
fix: vllm_v1 -> vllm
hhzhang16 Jul 25, 2025
1584cd0
feat: remove unneeded files
hhzhang16 Jul 25, 2025
dd3f161
docs: update docs
hhzhang16 Jul 25, 2025
97f3f88
fix: vllm config in profiler
hhzhang16 Jul 25, 2025
9b34ee9
feat: wip but tentatively working planner, with documentation
hhzhang16 Jul 25, 2025
eb56dbb
fi: use provided namespace for decode
hhzhang16 Jul 25, 2025
b68779d
feat: use k8s deployment info instead of hardcoding prometheus endpoint
hhzhang16 Jul 25, 2025
c8e394d
fix: if no requests have been made yet, don't try to access list
hhzhang16 Jul 25, 2025
1bbfd8d
feat: use SLAPLannerDefaults port
hhzhang16 Jul 25, 2025
ba6b5c1
docs: clean up sla planner deployment docs
hhzhang16 Jul 25, 2025
e533dda
feat: use DYNAMO_NAMESPACE env var instead of --namespace arg
hhzhang16 Jul 26, 2025
a548d74
feat: fixes for working planner
hhzhang16 Jul 26, 2025
f6af0d5
feat: skip adjustments if no traffic
hhzhang16 Jul 26, 2025
445fe74
docs: doc updates for planner deployment
hhzhang16 Jul 26, 2025
f0999da
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 26, 2025
bab714c
feat: delete k8s.sh
hhzhang16 Jul 26, 2025
a29c397
docs: slight doc modification
hhzhang16 Jul 26, 2025
daa3c4e
update resources
tedzhouhk Jul 26, 2025
539ff3e
update readme
tedzhouhk Jul 26, 2025
f994128
feat: address coderabbit MR comments
hhzhang16 Jul 28, 2025
d44b042
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 28, 2025
f601f88
fix pytest
tedzhouhk Jul 28, 2025
69d64dc
mypy
tedzhouhk Jul 28, 2025
1fdf3e9
feat: addressing MR comments
hhzhang16 Jul 28, 2025
bf58bd0
feat: addressing MR comments
hhzhang16 Jul 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/profiler/README.md
6 changes: 3 additions & 3 deletions benchmarks/profiler/profile_sla.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,9 +589,9 @@ async def run_profile(args):
parser.add_argument(
"--backend",
type=str,
default="vllm_v1",
choices=["vllm_v1"],
help="backend type, currently support [vllm_v1]",
default="vllm",
choices=["vllm"],
help="backend type, currently support [vllm]",
)
parser.add_argument(
"--config",
Expand Down
36 changes: 18 additions & 18 deletions benchmarks/profiler/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
config = deepcopy(config)

# set metadata name
config["metadata"]["name"] = "vllm-v1-agg"
config["metadata"]["name"] = "vllm-agg"

# disable planner
if "Planner" in config["spec"]["services"]:
Expand All @@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
if target == "prefill":
# convert prefill worker into decode worker
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker
] = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
]
del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
]

args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker
]["extraPodSpec"]["mainContainer"]["args"]

args = break_arguments(args)
Expand All @@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
if "--no-enable-prefix-caching" not in args:
args = append_argument(args, "--no-enable-prefix-caching")

config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)

elif target == "decode":
# delete prefill worker
del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
]

args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker
]["extraPodSpec"]["mainContainer"]["args"]

args = break_arguments(args)
Expand All @@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
if "--no-enable-prefix-caching" in args:
args.remove("--no-enable-prefix-caching")

config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)

# set num workers to 1
decode_worker_config = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker
]
decode_worker_config["replicas"] = 1

Expand All @@ -150,16 +150,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
def set_config_tp_size(cls, config: dict, tp_size: int):
config = deepcopy(config)

config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"resources"
]["requests"]["gpu"] = str(tp_size)
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"resources"
]["limits"]["gpu"] = str(tp_size)

args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
]["extraPodSpec"]["mainContainer"]["args"]
args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"]

args = break_arguments(args)

Expand All @@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
except ValueError:
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])

config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)

return config

@classmethod
def get_model_name(cls, config: dict) -> str:
worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
"args"
]
Expand Down Expand Up @@ -232,5 +232,5 @@ def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:


CONFIG_MODIFIERS = {
"vllm_v1": VllmV1ConfigModifier,
"vllm": VllmV1ConfigModifier,
}
8 changes: 4 additions & 4 deletions benchmarks/profiler/utils/dynamo_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
import asyncio
import time
from pathlib import Path
from typing import Optional, Union
from typing import Any, Dict, List, Optional, Union

import aiofiles
import aiofiles # type: ignore[import-untyped]
import httpx # added for HTTP requests
import kubernetes_asyncio as kubernetes
import yaml
Expand Down Expand Up @@ -62,9 +62,9 @@ def __init__(
self.deployment_name = deployment_name
self.model_name = model_name
self.service_name = service_name or f"{deployment_name}-frontend"
self.components: list[str] = [] # Will store component names from CR
self.components: List[str] = [] # Will store component names from CR
self.deployment_spec: Optional[
dict
Dict[str, Any]
] = None # Will store the full deployment spec
self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
self.frontend_port = frontend_port
Expand Down
3 changes: 3 additions & 0 deletions components/backends/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
- `agg_router.yaml` - Aggregated serving with KV routing
- `disagg.yaml` - Disaggregated serving
- `disagg_router.yaml` - Disaggregated serving with KV routing
- `disagg_planner.yaml` - Disaggregated serving with [SLA Planner](../../../docs/architecture/sla_planner.md). See [SLA Planner Deployment Guide](../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) for more details.

#### Prerequisites

Expand All @@ -124,6 +125,8 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
# Update the image references in the YAML files
```

- **Pre-Deployment Profiling (if Using SLA Planner)**: Follow the [pre-deployment profiling guide](../../../docs/architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner.

- **Port Forwarding**: After deployment, forward the frontend service to access the API:
```bash
kubectl port-forward deployment/vllm-v1-disagg-frontend-<pod-uuid-info> 8080:8000
Expand Down
136 changes: 116 additions & 20 deletions components/backends/vllm/deploy/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ kind: DynamoGraphDeployment
metadata:
name: vllm-disagg-planner
spec:
envs:
- name: DYNAMO_SERVICE_CONFIG
value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
- name: DYNAMO_PORT
value: "8000"
- name: DYNAMO_NAMESPACE
value: "vllm-disagg-planner"
services:
Frontend:
dynamoNamespace: vllm-disagg-planner
Expand All @@ -31,25 +38,114 @@ spec:
failureThreshold: 10
resources:
requests:
cpu: "1"
cpu: "32"
memory: "10Gi"
limits:
cpu: "32"
memory: "10Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000"
Planner:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: planner
replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "2"
memory: "2Gi"
limits:
cpu: "1"
cpu: "2"
memory: "2Gi"
pvc:
create: false
name: profiling-pvc # Must be pre-created before deployment and SLA profiler must have been run
mountPoint: /workspace/profiling_results
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
workingDir: /workspace/components/planner/src/dynamo/planner
args:
- python
- -m
- planner_sla
- --environment=kubernetes
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
Prometheus:
dynamoNamespace: vllm-disagg-planner
componentType: main
replicas: 1
envs:
- name: PYTHONPATH
value: "/workspace/components/planner/src"
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 30
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "2"
memory: "2Gi"
limits:
cpu: "2"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000"
VllmDecodeWorker:
- "python3 -m dynamo.planner.prometheus"
backend:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
replicas: 2
livenessProbe:
httpGet:
path: /live
Expand All @@ -66,12 +162,12 @@ spec:
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
cpu: "8"
memory: "16Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "8"
memory: "16Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
Expand All @@ -88,18 +184,18 @@ spec:
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
prefill:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
replicas: 2
livenessProbe:
httpGet:
path: /health
Expand All @@ -116,12 +212,12 @@ spec:
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
cpu: "8"
memory: "16Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "8"
memory: "16Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
Expand All @@ -138,10 +234,10 @@ spec:
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log
Loading
Loading