From 211bbab2bff7b976897d0d254b5d8a8274fe7c6e Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Tue, 14 Oct 2025 17:38:41 -0400 Subject: [PATCH 01/26] fix: circular rust dynamo-parsers, dynamo-llm dependency (#3607) (#3609) Signed-off-by: Graham King Signed-off-by: Harrison Saturley-Hall Co-authored-by: Graham King --- Cargo.lock | 1 - .../tests/parallel_tool_call_integration.rs | 0 lib/parsers/Cargo.toml | 3 --- lib/parsers/tests/mod.rs | 7 ------- 4 files changed, 11 deletions(-) rename lib/{parsers => llm}/tests/parallel_tool_call_integration.rs (100%) delete mode 100644 lib/parsers/tests/mod.rs diff --git a/Cargo.lock b/Cargo.lock index b008bd2998ec..5db37123158e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2219,7 +2219,6 @@ version = "0.6.0" dependencies = [ "anyhow", "dynamo-async-openai", - "dynamo-llm", "lazy_static", "num-traits", "openai-harmony", diff --git a/lib/parsers/tests/parallel_tool_call_integration.rs b/lib/llm/tests/parallel_tool_call_integration.rs similarity index 100% rename from lib/parsers/tests/parallel_tool_call_integration.rs rename to lib/llm/tests/parallel_tool_call_integration.rs diff --git a/lib/parsers/Cargo.toml b/lib/parsers/Cargo.toml index 75caba2fa874..c4d6fb93ea00 100644 --- a/lib/parsers/Cargo.toml +++ b/lib/parsers/Cargo.toml @@ -38,6 +38,3 @@ openai-harmony = "0.0.3" lazy_static = "1.5.0" rustpython-parser = "0.4.0" num-traits = "0.2" - -[dev-dependencies] -dynamo-llm = { workspace = true } diff --git a/lib/parsers/tests/mod.rs b/lib/parsers/tests/mod.rs deleted file mode 100644 index 575a064fee2e..000000000000 --- a/lib/parsers/tests/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! 
Tests for tool calling functionality - -#[cfg(test)] -mod parallel_tool_call_integration; From 0b6ef5ff06385a31c75cd27640eac0d66ef22745 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Tue, 14 Oct 2025 17:39:09 -0400 Subject: [PATCH 02/26] chore: update the relevant my-registry and my-tag (#3611) Signed-off-by: Harrison Saturley-Hall --- Earthfile | 6 +++--- benchmarks/incluster/benchmark_job.yaml | 2 +- benchmarks/profiler/utils/config.py | 2 +- components/backends/sglang/deploy/README.md | 4 ++-- components/backends/sglang/deploy/agg.yaml | 4 ++-- components/backends/sglang/deploy/agg_logging.yaml | 4 ++-- components/backends/sglang/deploy/agg_router.yaml | 4 ++-- components/backends/sglang/deploy/disagg-multinode.yaml | 6 +++--- components/backends/sglang/deploy/disagg.yaml | 6 +++--- components/backends/sglang/deploy/disagg_planner.yaml | 8 ++++---- components/backends/trtllm/deploy/README.md | 6 +++--- components/backends/trtllm/deploy/agg-with-config.yaml | 4 ++-- components/backends/trtllm/deploy/agg.yaml | 4 ++-- components/backends/trtllm/deploy/agg_router.yaml | 4 ++-- components/backends/trtllm/deploy/disagg-multinode.yaml | 6 +++--- components/backends/trtllm/deploy/disagg.yaml | 6 +++--- components/backends/trtllm/deploy/disagg_planner.yaml | 8 ++++---- components/backends/trtllm/deploy/disagg_router.yaml | 6 +++--- components/backends/vllm/deploy/README.md | 4 ++-- components/backends/vllm/deploy/agg.yaml | 4 ++-- components/backends/vllm/deploy/agg_kvbm.yaml | 4 ++-- components/backends/vllm/deploy/agg_router.yaml | 4 ++-- components/backends/vllm/deploy/disagg-multinode.yaml | 6 +++--- components/backends/vllm/deploy/disagg.yaml | 6 +++--- components/backends/vllm/deploy/disagg_kvbm.yaml | 6 +++--- components/backends/vllm/deploy/disagg_kvbm_2p2d.yaml | 6 +++--- components/backends/vllm/deploy/disagg_kvbm_tp2.yaml | 6 +++--- components/backends/vllm/deploy/disagg_planner.yaml | 8 ++++---- components/backends/vllm/deploy/disagg_router.yaml 
| 6 +++--- deploy/cloud/operator/Earthfile | 2 +- docs/_includes/install.rst | 4 ++-- docs/backends/sglang/README.md | 2 +- docs/backends/trtllm/gpt-oss.md | 2 +- docs/benchmarks/benchmarking.md | 2 +- docs/kubernetes/create_deployment.md | 2 +- docs/kubernetes/sla_planner_quickstart.md | 2 +- .../kubernetes/Distributed_Inference/agg_router.yaml | 4 ++-- .../custom_backend/hello_world/deploy/hello_world.yaml | 4 ++-- examples/deployments/ECS/task_definition_frontend.json | 2 +- .../deployments/ECS/task_definition_prefillworker.json | 2 +- examples/multimodal/deploy/agg_llava.yaml | 8 ++++---- examples/multimodal/deploy/agg_qwen.yaml | 8 ++++---- recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 ++-- recipes/llama-3-70b/vllm/agg/deploy.yaml | 4 ++-- recipes/llama-3-70b/vllm/agg/perf.yaml | 2 +- recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml | 6 +++--- recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml | 2 +- recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml | 6 +++--- recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml | 2 +- tests/planner/perf_test_configs/agg_8b.yaml | 4 ++-- tests/planner/perf_test_configs/disagg_8b_2p2d.yaml | 6 +++--- tests/planner/perf_test_configs/disagg_8b_3p1d.yaml | 6 +++--- tests/planner/perf_test_configs/disagg_8b_planner.yaml | 8 ++++---- tests/planner/perf_test_configs/disagg_8b_tp2.yaml | 6 +++--- .../planner/perf_test_configs/image_cache_daemonset.yaml | 2 +- tests/planner/scaling/disagg_planner.yaml | 8 ++++---- 56 files changed, 130 insertions(+), 130 deletions(-) diff --git a/Earthfile b/Earthfile index 6a9482219403..9e7e35d9d5b7 100644 --- a/Earthfile +++ b/Earthfile @@ -134,7 +134,7 @@ dynamo-build: dynamo-base-docker: ARG IMAGE=dynamo-base-docker - ARG DOCKER_SERVER=my-registry + ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo ARG IMAGE_TAG=latest FROM ubuntu:24.04 @@ -175,7 +175,7 @@ all-test: BUILD ./deploy/cloud/operator+test all-docker: - ARG DOCKER_SERVER=my-registry + ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo 
ARG IMAGE_TAG=latest BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG @@ -189,6 +189,6 @@ all: # For testing custom: - ARG DOCKER_SERVER=my-registry + ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo ARG IMAGE_TAG=latest BUILD +all-test diff --git a/benchmarks/incluster/benchmark_job.yaml b/benchmarks/incluster/benchmark_job.yaml index 1b22ad537b76..fe91db35a58f 100644 --- a/benchmarks/incluster/benchmark_job.yaml +++ b/benchmarks/incluster/benchmark_job.yaml @@ -18,7 +18,7 @@ spec: containers: - name: benchmark-runner # TODO: update to latest public image in next release - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 087e5c035a39..426dc80a6238 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -111,7 +111,7 @@ class DgdPlannerServiceConfig(BaseModel): volumeMounts: list[VolumeMount] = [VolumeMount()] extraPodSpec: PodSpec = PodSpec( mainContainer=Container( - image="my-registry/dynamo-runtime:my-tag", # placeholder + image="nvcr.io/nvidia/ai-dynamo/dynamo-runtime:0.6.0", # placeholder workingDir="/workspace/components/src/dynamo/planner", command=["python3", "-m", "planner_sla"], args=[], diff --git a/components/backends/sglang/deploy/README.md b/components/backends/sglang/deploy/README.md index b2ebff3da983..759b01ee3354 100644 --- a/components/backends/sglang/deploy/README.md +++ b/components/backends/sglang/deploy/README.md @@ -61,7 +61,7 @@ resources: ```yaml extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang args: - "python3" @@ -92,7 +92,7 @@ Edit the template to match your environment: ```yaml # Update image registry and tag -image: 
my-registry/sglang-runtime:my-tag +image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 # Configure your model args: diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml index 8c444b384e17..055475d7ca15 100644 --- a/components/backends/sglang/deploy/agg.yaml +++ b/components/backends/sglang/deploy/agg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 decode: envFromSecret: hf-token-secret dynamoNamespace: sglang-agg @@ -24,7 +24,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/sglang/deploy/agg_logging.yaml b/components/backends/sglang/deploy/agg_logging.yaml index 93fa747c2bef..d00975f2472f 100644 --- a/components/backends/sglang/deploy/agg_logging.yaml +++ b/components/backends/sglang/deploy/agg_logging.yaml @@ -16,7 +16,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 decode: envFromSecret: hf-token-secret dynamoNamespace: sglang-agg @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml index 142f1932f82f..9c3c29ed5788 100644 --- a/components/backends/sglang/deploy/agg_router.yaml +++ b/components/backends/sglang/deploy/agg_router.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 envs: - name: 
DYN_ROUTER_MODE value: kv @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/sglang/deploy/disagg-multinode.yaml b/components/backends/sglang/deploy/disagg-multinode.yaml index 97ed7c0bab05..15be05a8a541 100644 --- a/components/backends/sglang/deploy/disagg-multinode.yaml +++ b/components/backends/sglang/deploy/disagg-multinode.yaml @@ -22,7 +22,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 decode: multinode: nodeCount: 2 @@ -35,7 +35,7 @@ spec: gpu: "4" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 @@ -70,7 +70,7 @@ spec: gpu: "4" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index 451a01d9c5af..73a9ca5347dc 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 decode: envFromSecret: hf-token-secret dynamoNamespace: sglang-disagg @@ -25,7 +25,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 @@ -58,7 +58,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: 
my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/sglang/deploy/disagg_planner.yaml b/components/backends/sglang/deploy/disagg_planner.yaml index a99befe30b69..835373fec4e0 100644 --- a/components/backends/sglang/deploy/disagg_planner.yaml +++ b/components/backends/sglang/deploy/disagg_planner.yaml @@ -16,7 +16,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 Planner: dynamoNamespace: dynamo envFromSecret: hf-token-secret @@ -27,7 +27,7 @@ spec: mountPoint: /data extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/src/dynamo/planner command: - python3 @@ -49,7 +49,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 @@ -81,7 +81,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/sglang-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - python3 diff --git a/components/backends/trtllm/deploy/README.md b/components/backends/trtllm/deploy/README.md index 315e518d01c2..8e2d24425a44 100644 --- a/components/backends/trtllm/deploy/README.md +++ b/components/backends/trtllm/deploy/README.md @@ -89,7 +89,7 @@ resources: ```yaml extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm args: - "python3" @@ -109,7 +109,7 @@ Before using these templates, ensure you have: ### Container Images -The deployment files currently require access to 
`my-registry/trtllm-runtime`. If you don't have access, build and push your own image: +The deployment files currently require access to `nvcr.io/nvidia/ai-dynamo/trtllm-runtime`. If you don't have access, build and push your own image: ```bash ./container/build.sh --framework tensorrtllm @@ -141,7 +141,7 @@ Edit the template to match your environment: ```yaml # Update image registry and tag -image: my-registry/trtllm-runtime:my-tag +image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 # Configure your model and deployment settings args: diff --git a/components/backends/trtllm/deploy/agg-with-config.yaml b/components/backends/trtllm/deploy/agg-with-config.yaml index dd15e56e65dc..9a636a4b98c8 100644 --- a/components/backends/trtllm/deploy/agg-with-config.yaml +++ b/components/backends/trtllm/deploy/agg-with-config.yaml @@ -34,7 +34,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.0 TRTLLMWorker: envFromSecret: hf-token-secret dynamoNamespace: trtllm-agg @@ -50,7 +50,7 @@ spec: configMap: name: nvidia-config mainContainer: - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm # mount the configmap as a volume volumeMounts: diff --git a/components/backends/trtllm/deploy/agg.yaml b/components/backends/trtllm/deploy/agg.yaml index c7187673e411..a1a5a38bec7f 100644 --- a/components/backends/trtllm/deploy/agg.yaml +++ b/components/backends/trtllm/deploy/agg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 TRTLLMWorker: envFromSecret: hf-token-secret dynamoNamespace: trtllm-agg @@ -24,7 +24,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: 
nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/trtllm/deploy/agg_router.yaml b/components/backends/trtllm/deploy/agg_router.yaml index 787deb984740..5d7a8836451c 100644 --- a/components/backends/trtllm/deploy/agg_router.yaml +++ b/components/backends/trtllm/deploy/agg_router.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 envs: - name: DYN_ROUTER_MODE value: kv @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/trtllm/deploy/disagg-multinode.yaml b/components/backends/trtllm/deploy/disagg-multinode.yaml index 3da492107a2f..307b3c5a456f 100644 --- a/components/backends/trtllm/deploy/disagg-multinode.yaml +++ b/components/backends/trtllm/deploy/disagg-multinode.yaml @@ -95,7 +95,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -127,7 +127,7 @@ spec: - name: nvidia-config mountPath: /workspace/components/backends/trtllm/engine_configs readOnly: true - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -167,7 +167,7 @@ spec: - name: nvidia-config mountPath: /workspace/components/backends/trtllm/engine_configs readOnly: true - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/trtllm/deploy/disagg.yaml 
b/components/backends/trtllm/deploy/disagg.yaml index 9055967dfe39..4257d97a488a 100644 --- a/components/backends/trtllm/deploy/disagg.yaml +++ b/components/backends/trtllm/deploy/disagg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 TRTLLMPrefillWorker: dynamoNamespace: trtllm-disagg envFromSecret: hf-token-secret @@ -25,7 +25,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -53,7 +53,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/trtllm/deploy/disagg_planner.yaml b/components/backends/trtllm/deploy/disagg_planner.yaml index 09326e786d4f..83e4f2f7a891 100644 --- a/components/backends/trtllm/deploy/disagg_planner.yaml +++ b/components/backends/trtllm/deploy/disagg_planner.yaml @@ -16,7 +16,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -44,7 +44,7 @@ spec: mountPoint: /data extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/planner/src/dynamo/planner ports: - name: metrics @@ -85,7 +85,7 @@ spec: extraPodSpec: terminationGracePeriodSeconds: 600 mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -114,7 +114,7 @@ spec: extraPodSpec: terminationGracePeriodSeconds: 600 
mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/trtllm/deploy/disagg_router.yaml b/components/backends/trtllm/deploy/disagg_router.yaml index 31fde39e05f2..ebda442dc29b 100644 --- a/components/backends/trtllm/deploy/disagg_router.yaml +++ b/components/backends/trtllm/deploy/disagg_router.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 envs: - name: DYN_ROUTER_MODE value: kv @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 @@ -55,7 +55,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 workingDir: /workspace/components/backends/trtllm command: - python3 diff --git a/components/backends/vllm/deploy/README.md b/components/backends/vllm/deploy/README.md index e76aa030a653..7e726eec0b16 100644 --- a/components/backends/vllm/deploy/README.md +++ b/components/backends/vllm/deploy/README.md @@ -69,7 +69,7 @@ resources: ```yaml extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm args: - "python3" @@ -116,7 +116,7 @@ Edit the template to match your environment: ```yaml # Update image registry and tag -image: my-registry/vllm-runtime:my-tag +image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 # Configure your model args: diff --git a/components/backends/vllm/deploy/agg.yaml b/components/backends/vllm/deploy/agg.yaml index 95de87138a4e..c550f4702098 100644 --- a/components/backends/vllm/deploy/agg.yaml +++ 
b/components/backends/vllm/deploy/agg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-agg @@ -24,7 +24,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/agg_kvbm.yaml b/components/backends/vllm/deploy/agg_kvbm.yaml index 84dc29778633..dd0eba3c772a 100644 --- a/components/backends/vllm/deploy/agg_kvbm.yaml +++ b/components/backends/vllm/deploy/agg_kvbm.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-agg-kvbm @@ -31,7 +31,7 @@ spec: value: "100" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/agg_router.yaml b/components/backends/vllm/deploy/agg_router.yaml index 999dd75f6411..ac328c2a7b77 100644 --- a/components/backends/vllm/deploy/agg_router.yaml +++ b/components/backends/vllm/deploy/agg_router.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 envs: - name: DYN_ROUTER_MODE value: kv @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git 
a/components/backends/vllm/deploy/disagg-multinode.yaml b/components/backends/vllm/deploy/disagg-multinode.yaml index e46b2ed6afc5..e52f263a9212 100644 --- a/components/backends/vllm/deploy/disagg-multinode.yaml +++ b/components/backends/vllm/deploy/disagg-multinode.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -34,7 +34,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -57,7 +57,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/disagg.yaml b/components/backends/vllm/deploy/disagg.yaml index d7288a62da0f..b6bb638ce7e0 100644 --- a/components/backends/vllm/deploy/disagg.yaml +++ b/components/backends/vllm/deploy/disagg.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: dynamoNamespace: vllm-disagg envFromSecret: hf-token-secret @@ -25,7 +25,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -45,7 +45,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/disagg_kvbm.yaml 
b/components/backends/vllm/deploy/disagg_kvbm.yaml index d3455a87ce8a..c4f73fd03bb2 100644 --- a/components/backends/vllm/deploy/disagg_kvbm.yaml +++ b/components/backends/vllm/deploy/disagg_kvbm.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: dynamoNamespace: vllm-disagg-kvbm envFromSecret: hf-token-secret @@ -24,7 +24,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -56,7 +56,7 @@ spec: value: "100" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/disagg_kvbm_2p2d.yaml b/components/backends/vllm/deploy/disagg_kvbm_2p2d.yaml index 51c35050d0c3..134cea28b922 100644 --- a/components/backends/vllm/deploy/disagg_kvbm_2p2d.yaml +++ b/components/backends/vllm/deploy/disagg_kvbm_2p2d.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: dynamoNamespace: vllm-disagg-kvbm-2p2d envFromSecret: hf-token-secret @@ -24,7 +24,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -60,7 +60,7 @@ spec: fieldPath: metadata.name extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git 
a/components/backends/vllm/deploy/disagg_kvbm_tp2.yaml b/components/backends/vllm/deploy/disagg_kvbm_tp2.yaml index 30b5b0844006..947d47a42a8b 100644 --- a/components/backends/vllm/deploy/disagg_kvbm_tp2.yaml +++ b/components/backends/vllm/deploy/disagg_kvbm_tp2.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 VllmDecodeWorker: dynamoNamespace: vllm-disagg-kvbm-tp2 envFromSecret: hf-token-secret @@ -26,7 +26,7 @@ spec: gpu: "2" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -64,7 +64,7 @@ spec: fieldPath: metadata.name extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml index 5afbf58c70d9..184b705746d6 100644 --- a/components/backends/vllm/deploy/disagg_planner.yaml +++ b/components/backends/vllm/deploy/disagg_planner.yaml @@ -16,7 +16,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 Planner: dynamoNamespace: vllm-disagg-planner componentType: planner @@ -26,7 +26,7 @@ spec: mountPoint: /data extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/src/dynamo/planner command: - python3 @@ -48,7 +48,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm 
command: - python3 @@ -68,7 +68,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/components/backends/vllm/deploy/disagg_router.yaml b/components/backends/vllm/deploy/disagg_router.yaml index a298bdcfe519..06022f3089ac 100644 --- a/components/backends/vllm/deploy/disagg_router.yaml +++ b/components/backends/vllm/deploy/disagg_router.yaml @@ -13,7 +13,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 envs: - name: DYN_ROUTER_MODE value: kv @@ -27,7 +27,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -46,7 +46,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/deploy/cloud/operator/Earthfile b/deploy/cloud/operator/Earthfile index 4d368e12b52a..c80fe353f06c 100644 --- a/deploy/cloud/operator/Earthfile +++ b/deploy/cloud/operator/Earthfile @@ -45,7 +45,7 @@ test: SAVE ARTIFACT cover.out docker: - ARG DOCKER_SERVER=my-registry + ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo ARG IMAGE_TAG=latest ARG IMAGE_SUFFIX=dynamo-operator FROM nvcr.io/nvidia/distroless/go:v3.1.13 diff --git a/docs/_includes/install.rst b/docs/_includes/install.rst index 3403c6f827b9..bd005c556021 100644 --- a/docs/_includes/install.rst +++ b/docs/_includes/install.rst @@ -10,7 +10,7 @@ Install a pre-built wheel from PyPI. 
source venv/bin/activate # Install Dynamo from PyPI (choose one backend extra) - uv pip install "ai-dynamo[sglang]==my-tag" # or [vllm], [trtllm] + uv pip install "ai-dynamo[sglang]==0.6.0" # or [vllm], [trtllm] Pip from source @@ -41,4 +41,4 @@ Pull and run prebuilt images from NVIDIA NGC (`nvcr.io`). docker run --rm -it \ --gpus all \ --network host \ - nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag # or vllm, tensorrtllm + nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 # or vllm, tensorrtllm diff --git a/docs/backends/sglang/README.md b/docs/backends/sglang/README.md index 267ad871785b..553a3405f9bb 100644 --- a/docs/backends/sglang/README.md +++ b/docs/backends/sglang/README.md @@ -130,7 +130,7 @@ uv pip install --prerelease=allow -e .[sglang] Instructions ```bash -docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag +docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 ``` diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 5950fff66a33..6a11712724ae 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -49,7 +49,7 @@ huggingface-cli download openai/gpt-oss-120b --exclude "original/*" --exclude "m Set the container image: ```bash -export DYNAMO_CONTAINER_IMAGE=nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag +export DYNAMO_CONTAINER_IMAGE=nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.0 ``` Launch the Dynamo TensorRT-LLM container with the necessary configurations: diff --git a/docs/benchmarks/benchmarking.md b/docs/benchmarks/benchmarking.md index bfd76ecc83b8..1f9e154335cd 100644 --- a/docs/benchmarks/benchmarking.md +++ b/docs/benchmarks/benchmarking.md @@ -410,7 +410,7 @@ The benchmark job is configured directly in the YAML file. 
- **Model**: `Qwen/Qwen3-0.6B` - **Benchmark Name**: `qwen3-0p6b-vllm-agg` - **Service**: `vllm-agg-frontend:8000` -- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag` +- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0` ### Customizing the Job diff --git a/docs/kubernetes/create_deployment.md b/docs/kubernetes/create_deployment.md index c49ec9eabc2f..23b37d357ab6 100644 --- a/docs/kubernetes/create_deployment.md +++ b/docs/kubernetes/create_deployment.md @@ -158,7 +158,7 @@ When disabled, you can manually specify secrets as you would for a normal pod sp nvidia.com/disable-image-pull-secret-discovery: "true" extraPodSpec: imagePullSecrets: - - name: my-registry-secret + - name: nvcr-imagepullsecret - name: another-secret mainContainer: image: your-image diff --git a/docs/kubernetes/sla_planner_quickstart.md b/docs/kubernetes/sla_planner_quickstart.md index 6eccd17be596..8e65be24fa12 100644 --- a/docs/kubernetes/sla_planner_quickstart.md +++ b/docs/kubernetes/sla_planner_quickstart.md @@ -109,7 +109,7 @@ To automatically deploy the optimized DGD with planner after profiling, add `--d Set the container image and config path: ```bash -export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag +export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 export DGD_CONFIG_FILE=/data/configs/disagg.yaml ``` diff --git a/examples/basics/kubernetes/Distributed_Inference/agg_router.yaml b/examples/basics/kubernetes/Distributed_Inference/agg_router.yaml index 2438d81efd58..34ad6d0e8492 100644 --- a/examples/basics/kubernetes/Distributed_Inference/agg_router.yaml +++ b/examples/basics/kubernetes/Distributed_Inference/agg_router.yaml @@ -38,7 +38,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -95,7 +95,7 @@ spec: port: 9090 periodSeconds: 10
failureThreshold: 60 - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 volumeMounts: - name: local-model-cache mountPath: /root/.cache diff --git a/examples/custom_backend/hello_world/deploy/hello_world.yaml b/examples/custom_backend/hello_world/deploy/hello_world.yaml index 0fb8f4238ee5..a274ff6fad8f 100644 --- a/examples/custom_backend/hello_world/deploy/hello_world.yaml +++ b/examples/custom_backend/hello_world/deploy/hello_world.yaml @@ -41,7 +41,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: my-registry/dynamo:my-tag + image: nvcr.io/nvidia/ai-dynamo/dynamo:0.6.0 workingDir: /workspace/examples/custom_backend/hello_world/ command: - /bin/sh @@ -80,7 +80,7 @@ spec: memory: "4Gi" extraPodSpec: mainContainer: - image: my-registry/dynamo:my-tag + image: nvcr.io/nvidia/ai-dynamo/dynamo:0.6.0 workingDir: /workspace/examples/custom_backend/hello_world/ command: - /bin/sh diff --git a/examples/deployments/ECS/task_definition_frontend.json b/examples/deployments/ECS/task_definition_frontend.json index fda0a2887669..99f77f310f2b 100644 --- a/examples/deployments/ECS/task_definition_frontend.json +++ b/examples/deployments/ECS/task_definition_frontend.json @@ -3,7 +3,7 @@ "containerDefinitions": [ { "name": "dynamo-vllm-frontend", - "image": "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag", + "image": "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0", "repositoryCredentials": { "credentialsParameter": "arn:aws:secretsmanager:AWS_REGION:AWS_ID:secret:ngc_nvcr_access" }, diff --git a/examples/deployments/ECS/task_definition_prefillworker.json b/examples/deployments/ECS/task_definition_prefillworker.json index fdf928bf3d0b..cd6d01912d64 100644 --- a/examples/deployments/ECS/task_definition_prefillworker.json +++ b/examples/deployments/ECS/task_definition_prefillworker.json @@ -3,7 +3,7 @@ "containerDefinitions": [ { "name": "dynamo-prefill", - "image": "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag", + 
"image": "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0", "repositoryCredentials": { "credentialsParameter": "arn:aws:secretsmanager:AWS_REGION:AWS_ID:secret:ngc_access" }, diff --git a/examples/multimodal/deploy/agg_llava.yaml b/examples/multimodal/deploy/agg_llava.yaml index 8c125666b19b..a215ff84b4a0 100644 --- a/examples/multimodal/deploy/agg_llava.yaml +++ b/examples/multimodal/deploy/agg_llava.yaml @@ -14,7 +14,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 EncodeWorker: envFromSecret: hf-token-secret dynamoNamespace: agg-llava @@ -25,7 +25,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh @@ -42,7 +42,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh @@ -59,7 +59,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh diff --git a/examples/multimodal/deploy/agg_qwen.yaml b/examples/multimodal/deploy/agg_qwen.yaml index 174979109a1b..0622411cf349 100644 --- a/examples/multimodal/deploy/agg_qwen.yaml +++ b/examples/multimodal/deploy/agg_qwen.yaml @@ -14,7 +14,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 EncodeWorker: envFromSecret: hf-token-secret dynamoNamespace: agg-qwen @@ -25,7 +25,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh @@ -42,7 +42,7 @@ spec: gpu: 
"1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh @@ -59,7 +59,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/examples/multimodal command: - /bin/sh diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index 16be6ffae0a5..efc4e5765da7 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -30,7 +30,7 @@ spec: command: - /bin/sh - -c - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 pvc: create: false mountPoint: /model-store @@ -69,7 +69,7 @@ spec: command: - /bin/sh - -c - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.0 env: - name: TRTLLM_ENABLE_PDL value: "1" diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml b/recipes/llama-3-70b/vllm/agg/deploy.yaml index c6afcc21ac96..274b4633901e 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -18,7 +18,7 @@ spec: mountPoint: /root/.cache/huggingface extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 VllmPrefillWorker: @@ -37,7 +37,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 resources: diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index 28ddd4693a5b..a47f46ceaea8 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -16,7 +16,7 
@@ spec: restartPolicy: Never containers: - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml index 9a40765db92b..ac1e012e9d10 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml @@ -18,7 +18,7 @@ spec: mountPoint: /root/.cache/huggingface extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 VllmPrefillWorker: @@ -37,7 +37,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 resources: @@ -61,7 +61,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 resources: diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml index 92fa5c75205c..df21b60cf261 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml @@ -16,7 +16,7 @@ spec: restartPolicy: Never containers: - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml index c833b15707e5..4cd67872f35e 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml +++ 
b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml @@ -18,7 +18,7 @@ spec: mountPoint: /root/.cache/huggingface extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 VllmPrefillWorker: @@ -47,7 +47,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 2 resources: @@ -81,7 +81,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm replicas: 1 resources: diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml index c6bbe575344b..89a69ed0d1cf 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml @@ -16,7 +16,7 @@ spec: restartPolicy: Never containers: - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh diff --git a/tests/planner/perf_test_configs/agg_8b.yaml b/tests/planner/perf_test_configs/agg_8b.yaml index c176e62cec71..937e0b61f29c 100644 --- a/tests/planner/perf_test_configs/agg_8b.yaml +++ b/tests/planner/perf_test_configs/agg_8b.yaml @@ -38,7 +38,7 @@ spec: memory: "100Gi" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -88,7 +88,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: 
/workspace/components/backends/vllm command: - /bin/sh diff --git a/tests/planner/perf_test_configs/disagg_8b_2p2d.yaml b/tests/planner/perf_test_configs/disagg_8b_2p2d.yaml index 4b38a9a9b048..b71fc8105c7a 100644 --- a/tests/planner/perf_test_configs/disagg_8b_2p2d.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_2p2d.yaml @@ -38,7 +38,7 @@ spec: memory: "100Gi" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -88,7 +88,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -138,7 +138,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh diff --git a/tests/planner/perf_test_configs/disagg_8b_3p1d.yaml b/tests/planner/perf_test_configs/disagg_8b_3p1d.yaml index 41618cf472a1..f9e053ce95aa 100644 --- a/tests/planner/perf_test_configs/disagg_8b_3p1d.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_3p1d.yaml @@ -38,7 +38,7 @@ spec: memory: "100Gi" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -88,7 +88,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -138,7 +138,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: 
- /bin/sh diff --git a/tests/planner/perf_test_configs/disagg_8b_planner.yaml b/tests/planner/perf_test_configs/disagg_8b_planner.yaml index f474fb637805..ddb052becc25 100644 --- a/tests/planner/perf_test_configs/disagg_8b_planner.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_planner.yaml @@ -41,7 +41,7 @@ spec: memory: "100Gi" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -74,7 +74,7 @@ spec: failureThreshold: 10 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/src/dynamo/planner ports: - name: metrics @@ -138,7 +138,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 @@ -195,7 +195,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - python3 diff --git a/tests/planner/perf_test_configs/disagg_8b_tp2.yaml b/tests/planner/perf_test_configs/disagg_8b_tp2.yaml index 3c83f78bcda6..6c4f2aaf33b5 100644 --- a/tests/planner/perf_test_configs/disagg_8b_tp2.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_tp2.yaml @@ -38,7 +38,7 @@ spec: memory: "100Gi" extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -88,7 +88,7 @@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -138,7 +138,7 
@@ spec: port: 9090 periodSeconds: 10 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh diff --git a/tests/planner/perf_test_configs/image_cache_daemonset.yaml b/tests/planner/perf_test_configs/image_cache_daemonset.yaml index 026337e5bab1..c258c83c1dc6 100644 --- a/tests/planner/perf_test_configs/image_cache_daemonset.yaml +++ b/tests/planner/perf_test_configs/image_cache_daemonset.yaml @@ -20,7 +20,7 @@ spec: - name: nvcr-imagepullsecret containers: - name: image-cache - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 command: - /bin/sh - -c diff --git a/tests/planner/scaling/disagg_planner.yaml b/tests/planner/scaling/disagg_planner.yaml index 0278b42bc840..53011ffe1d9a 100644 --- a/tests/planner/scaling/disagg_planner.yaml +++ b/tests/planner/scaling/disagg_planner.yaml @@ -18,7 +18,7 @@ spec: replicas: 1 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 Planner: dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret @@ -41,7 +41,7 @@ spec: failureThreshold: 10 extraPodSpec: mainContainer: - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/src/dynamo/planner ports: - name: metrics @@ -78,7 +78,7 @@ spec: port: 9090 periodSeconds: 30 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh @@ -102,7 +102,7 @@ spec: port: 9090 periodSeconds: 30 failureThreshold: 60 - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 workingDir: /workspace/components/backends/vllm command: - /bin/sh From a25268de7eb6de9deb5f5b001ff4b19c5b74ea20 Mon Sep 17 00:00:00 2001 From: Harrison 
Saturley-Hall Date: Tue, 14 Oct 2025 17:57:50 -0400 Subject: [PATCH 03/26] chore: typo and new commands (#3617) (#3625) Signed-off-by: Harrison King Saturley-Hall Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com> --- components/backends/sglang/deploy/disagg.yaml | 2 +- components/backends/sglang/launch/disagg_dp_attn.sh | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index 73a9ca5347dc..a1fbf5ab1805 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -61,7 +61,7 @@ spec: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 workingDir: /workspace/components/backends/sglang command: - - python3E + - python3 - -m - dynamo.sglang args: diff --git a/components/backends/sglang/launch/disagg_dp_attn.sh b/components/backends/sglang/launch/disagg_dp_attn.sh index b5d8ba1bc5c8..ae35364c1344 100755 --- a/components/backends/sglang/launch/disagg_dp_attn.sh +++ b/components/backends/sglang/launch/disagg_dp_attn.sh @@ -16,10 +16,6 @@ trap cleanup EXIT INT TERM python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! -# Set the expert distribution recording directory -mkdir -p /tmp/sglang_expert_distribution_record -export SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR=/tmp/sglang_expert_distribution_record - # run prefill worker python3 -m dynamo.sglang \ --model-path silence09/DeepSeek-R1-Small-2layers \ @@ -31,7 +27,7 @@ python3 -m dynamo.sglang \ --trust-remote-code \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ - --expert-distribution-recorder-mode stat \ + --load-balance-method round_robin \ --port 30000 & PREFILL_PID=$! 
@@ -46,5 +42,5 @@ CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \ --trust-remote-code \ --disaggregation-mode decode \ --disaggregation-transfer-backend nixl \ - --expert-distribution-recorder-mode stat \ + --prefill-round-robin-balance \ --port 31000 From a61a800c6eb9b761378fae512f57fc3a13879a22 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Tue, 14 Oct 2025 18:25:31 -0400 Subject: [PATCH 04/26] feat: cherry pick PR#3306 benchmarks use aiperf (#3626) Signed-off-by: Biswa Panda Signed-off-by: lkomali Signed-off-by: Harrison King Saturley-Hall Co-authored-by: Biswa Panda Co-authored-by: lkomali Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com> --- README.md | 2 +- benchmarks/README.md | 2 +- benchmarks/llm/perf.sh | 4 +- benchmarks/llm/plot_pareto.py | 36 ++++----- benchmarks/profiler/profile_endpoint.py | 4 +- benchmarks/profiler/profile_sla.py | 29 +++---- .../utils/{genai_perf.py => aiperf.py} | 78 +++++++++---------- benchmarks/profiler/utils/profile_cache.py | 12 +-- benchmarks/profiler/utils/profile_decode.py | 16 ++-- benchmarks/profiler/utils/profile_prefill.py | 12 +-- benchmarks/utils/genai.py | 22 +++--- benchmarks/utils/plot.py | 16 ++-- container/Dockerfile.sglang | 2 + container/Dockerfile.sglang-wideep | 4 +- container/Dockerfile.trtllm | 1 + container/Dockerfile.vllm | 1 + container/deps/requirements.txt | 5 +- docs/benchmarks/benchmarking.md | 18 ++--- recipes/llama-3-70b/vllm/agg/perf.yaml | 6 +- .../vllm/disagg-multi-node/perf.yaml | 6 +- .../vllm/disagg-single-node/perf.yaml | 6 +- 21 files changed, 144 insertions(+), 138 deletions(-) rename benchmarks/profiler/utils/{genai_perf.py => aiperf.py} (75%) diff --git a/README.md b/README.md index e64311bd3358..410eadbbc44d 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments: -- 
**[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf +- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf - **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements # Engines diff --git a/benchmarks/README.md b/benchmarks/README.md index 95cef75bd8b4..422bed831050 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -15,7 +15,7 @@ # Benchmarks -This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints. +This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around aiperf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints. ## Quick Start diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh index 10ad466a6ed1..446ec1f74d0c 100644 --- a/benchmarks/llm/perf.sh +++ b/benchmarks/llm/perf.sh @@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then echo "--------------------------------" fi -echo "Running genai-perf with:" +echo "Running aiperf with:" echo "Model: $model" echo "ISL: $isl" echo "OSL: $osl" @@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like # `ignore_eos` since they are not in the official OpenAI spec. 
- genai-perf profile \ + aiperf profile \ --model ${model} \ --tokenizer ${model} \ --endpoint-type chat \ diff --git a/benchmarks/llm/plot_pareto.py b/benchmarks/llm/plot_pareto.py index 9e8a32ac1414..24d4743c41cb 100755 --- a/benchmarks/llm/plot_pareto.py +++ b/benchmarks/llm/plot_pareto.py @@ -26,7 +26,7 @@ def get_json_paths(search_paths): - genai_perf_profile_export_json_paths = [] + aiperf_profile_export_json_paths = [] deployment_config_json_paths = [] for search_path in search_paths: deployment_config_json_path = os.path.join( @@ -34,15 +34,13 @@ def get_json_paths(search_paths): ) if not os.path.exists(deployment_config_json_path): raise Exception(f"deployment_config.json not found in {search_path}") - for root, dirs, files in os.walk(search_path): + for root, _, files in os.walk(search_path): for file in files: - if file == "profile_export_genai_perf.json": - genai_perf_profile_export_json_paths.append( - os.path.join(root, file) - ) + if file == "profile_export_aiperf.json": + aiperf_profile_export_json_paths.append(os.path.join(root, file)) deployment_config_json_paths.append(deployment_config_json_path) - return genai_perf_profile_export_json_paths, deployment_config_json_paths + return aiperf_profile_export_json_paths, deployment_config_json_paths # search for -concurrency in the name @@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path): def extract_val_and_concurrency( - genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg" + aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg" ): results = [] - for genai_perf_profile_export_json_path, deployment_config_json_path in zip( - genai_perf_profile_export_json_paths, deployment_config_json_paths + for aiperf_profile_export_json_path, deployment_config_json_path in zip( + aiperf_profile_export_json_paths, deployment_config_json_paths ): - with open(genai_perf_profile_export_json_path, "r") as f: + with 
open(aiperf_profile_export_json_path, "r") as f: data = json.load(f) # output_token_throughput contains only avg output_token_throughput = data.get("output_token_throughput", {}).get("avg") @@ -99,7 +97,7 @@ def extract_val_and_concurrency( # request_throughput contains only avg request_throughput = data.get("request_throughput", {}).get("avg") - concurrency = parse_concurrency(genai_perf_profile_export_json_path) + concurrency = parse_concurrency(aiperf_profile_export_json_path) num_gpus = parse_gpus(deployment_config_json_path) kind, mode = parse_kind_and_mode(deployment_config_json_path) @@ -116,7 +114,7 @@ def extract_val_and_concurrency( results.append( { - "configuration": genai_perf_profile_export_json_path, + "configuration": aiperf_profile_export_json_path, "kind": kind, "mode": mode, "num_gpus": num_gpus, @@ -241,12 +239,12 @@ def pareto_efficient(ids, points): import os parser = argparse.ArgumentParser( - description="Plot Pareto graph from GenAI-Perf artifacts" + description="Plot Pareto graph from AIPerf artifacts" ) parser.add_argument( "--artifacts-root-dir", required=True, - help="Root directory containing artifact directories to search for profile_export_genai_perf.json files", + help="Root directory containing artifact directories to search for profile_export_aiperf.json files", ) parser.add_argument( "--title", @@ -260,16 +258,16 @@ def pareto_efficient(ids, points): if not artifacts_dirs: raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}") - genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths( + aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths( artifacts_dirs ) - if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths): + if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths): raise ValueError( - f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match 
number of deployment_config_json_paths ({len(deployment_config_json_paths)})" + f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})" ) extracted_values = extract_val_and_concurrency( - genai_perf_profile_export_json_paths, deployment_config_json_paths + aiperf_profile_export_json_paths, deployment_config_json_paths ) create_pareto_graph(extracted_values, title=args.title) diff --git a/benchmarks/profiler/profile_endpoint.py b/benchmarks/profiler/profile_endpoint.py index 53d8c02b053b..63f0daf0d929 100644 --- a/benchmarks/profiler/profile_endpoint.py +++ b/benchmarks/profiler/profile_endpoint.py @@ -5,8 +5,8 @@ import logging import os -from utils.profile_decode import profile_decode -from utils.profile_prefill import profile_prefill +from benchmarks.profiler.utils.profile_decode import profile_decode +from benchmarks.profiler.utils.profile_prefill import profile_prefill logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 2dfca14612d9..8c670986cffa 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -22,13 +22,13 @@ import numpy as np import yaml +from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill from benchmarks.profiler.utils.config import ( CONFIG_MODIFIERS, WORKER_COMPONENT_NAMES, generate_dgd_config_with_planner, ) from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator -from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser from benchmarks.profiler.utils.plot import ( plot_decode_performance, @@ -245,18 +245,18 @@ async def run_profile(args): f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - # run 
genai-perf + # run ai-perf base_url = client.get_service_url() - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( + ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{args.isl}" + aiperf_result = benchmark_prefill( args.isl, - genai_perf_artifact_dir, + ai_perf_artifact_dir, model_name, model_name, base_url=base_url, ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] + if aiperf_result is not None: + ttft = aiperf_result["records"]["ttft"]["avg"] logger.info("Cleaning up deployment...") await client.delete_deployment() @@ -424,20 +424,23 @@ async def run_profile(args): ) else: base_url = client.get_service_url() - genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( + ai_perf_artifact_dir = f"{work_dir}/aiperf_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" + aiperf_result = benchmark_decode( args.isl, args.osl, num_request, - genai_perf_artifact_dir, + ai_perf_artifact_dir, model_name, model_name, base_url=base_url, ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] + if aiperf_result is not None: + itl = aiperf_result["records"]["inter_token_latency"]["avg"] thpt_per_gpu = ( - gap_result["output_token_throughput"]["avg"] / num_gpus + aiperf_result["records"]["output_token_throughput"][ + "avg" + ] + / num_gpus ) if itl is not None and thpt_per_gpu is not None: diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/aiperf.py similarity index 75% rename from benchmarks/profiler/utils/genai_perf.py rename to benchmarks/profiler/utils/aiperf.py index a053d19ab234..9e37af6f2aa5 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/aiperf.py @@ -30,7 +30,7 @@ logger.addHandler(console_handler) -def _get_common_genai_perf_cmd( +def _get_common_aiperf_cmd( artifact_dir, seed=100, 
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", @@ -38,7 +38,7 @@ def _get_common_genai_perf_cmd( base_url="http://localhost:8000", ): return [ - "genai-perf", + "aiperf", "profile", "--model", model, @@ -64,7 +64,7 @@ def _get_common_genai_perf_cmd( ] -def get_prefill_genai_perf_cmd( +def get_prefill_aiperf_cmd( isl, artifact_dir, seed=100, @@ -73,7 +73,7 @@ def get_prefill_genai_perf_cmd( osl=5, base_url="http://localhost:8000", ): - return _get_common_genai_perf_cmd( + return _get_common_aiperf_cmd( artifact_dir, seed, model, @@ -99,7 +99,7 @@ def get_prefill_genai_perf_cmd( ] -def get_decode_genai_perf_cmd( +def get_decode_aiperf_cmd( isl, osl, artifact_dir, @@ -109,7 +109,7 @@ def get_decode_genai_perf_cmd( tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", base_url="http://localhost:8000", ): - return _get_common_genai_perf_cmd( + return _get_common_aiperf_cmd( artifact_dir, seed, model, @@ -137,15 +137,15 @@ def get_decode_genai_perf_cmd( ] -def get_gap_result(artifact_dir: str) -> dict: +def get_aiperf_result(artifact_dir: str) -> dict: json_file_path = None for root, _, files in os.walk(artifact_dir): - if "profile_export_genai_perf.json" in files: - json_file_path = os.path.join(root, "profile_export_genai_perf.json") + if "profile_export_aiperf.json" in files: + json_file_path = os.path.join(root, "profile_export_aiperf.json") break if json_file_path is None: raise FileNotFoundError( - f"profile_export_genai_perf.json not found in {artifact_dir}" + f"profile_export_aiperf.json not found in {artifact_dir}" ) with open(json_file_path, "r") as f: return json.load(f) @@ -153,35 +153,35 @@ def get_gap_result(artifact_dir: str) -> dict: def benchmark_prefill( isl, - genai_perf_artifact_dir, + aiperf_artifact_dir, model_name, tokenizer, base_url="http://localhost:8000", ): - logger.info(f"Running genai-perf with isl {isl}") - genai_perf_cmd = get_prefill_genai_perf_cmd( + logger.info(f"Running aiperf with isl {isl}") + aiperf_cmd = get_prefill_aiperf_cmd( 
isl, - genai_perf_artifact_dir, + aiperf_artifact_dir, model=model_name, tokenizer=tokenizer, base_url=base_url, ) - print(f"genai-perf cmd: {genai_perf_cmd}") + print(f"aiperf cmd: {aiperf_cmd}") # import pdb; pdb.set_trace() - gap_process = subprocess.Popen( - genai_perf_cmd, + aiperf_process = subprocess.Popen( + aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) - stdout, stderr = gap_process.communicate() - if gap_process.returncode == 0: - logger.info("Genai-perf profiling completed successfully") + stdout, stderr = aiperf_process.communicate() + if aiperf_process.returncode == 0: + logger.info("AIperf profiling completed successfully") logger.info(stdout) - gap_result = get_gap_result(genai_perf_artifact_dir) - return gap_result + aiperf_result = get_aiperf_result(aiperf_artifact_dir) + return aiperf_result else: - logger.error(f"Genai-perf failed with error code: {gap_process.returncode}") + logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}") logger.error(f"stderr: {stderr}") return None @@ -190,7 +190,7 @@ def benchmark_decode( isl, osl, num_request, - genai_perf_artifact_dir, + aiperf_artifact_dir, model_name, tokenizer, base_url="http://localhost:8000", @@ -201,47 +201,47 @@ def benchmark_decode( # we use the same random seed to make sure the prompt is the same seed = random.randint(0, 1000000) - genai_perf_cmd = get_decode_genai_perf_cmd( + aiperf_cmd = get_decode_aiperf_cmd( isl, osl, - f"{genai_perf_artifact_dir}_warmup", + f"{aiperf_artifact_dir}_warmup", num_request, seed=seed, model=model_name, tokenizer=tokenizer, base_url=base_url, ) - gap_process = subprocess.Popen( - genai_perf_cmd, + aiperf_process = subprocess.Popen( + aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) - gap_process.communicate() + aiperf_process.communicate() # then send out the real requests, hopefully, this will skip all prefill computation - genai_perf_cmd = get_decode_genai_perf_cmd( + aiperf_cmd = 
get_decode_aiperf_cmd( isl, osl, - genai_perf_artifact_dir, + aiperf_artifact_dir, num_request, seed=seed, model=model_name, tokenizer=tokenizer, base_url=base_url, ) - gap_process = subprocess.Popen( - genai_perf_cmd, + aiperf_process = subprocess.Popen( + aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) - stdout, stderr = gap_process.communicate() - if gap_process.returncode == 0: - logger.info("Genai-perf profiling completed successfully") + stdout, stderr = aiperf_process.communicate() + if aiperf_process.returncode == 0: + logger.info("AIperf profiling completed successfully") logger.info(stdout) - gap_result = get_gap_result(genai_perf_artifact_dir) - return gap_result + aiperf_result = get_aiperf_result(aiperf_artifact_dir) + return aiperf_result else: - logger.error(f"Genai-perf failed with error code: {gap_process.returncode}") + logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}") logger.error(f"stderr: {stderr}") return None diff --git a/benchmarks/profiler/utils/profile_cache.py b/benchmarks/profiler/utils/profile_cache.py index edc422cca9e4..b9e0fc9fae9d 100644 --- a/benchmarks/profiler/utils/profile_cache.py +++ b/benchmarks/profiler/utils/profile_cache.py @@ -26,13 +26,13 @@ def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool: """Check if prefill results already exist for a given TP size.""" work_dir = f"{output_dir}/prefill_tp{tp_size}" - result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json" + result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json" # Check if the work directory exists if not os.path.exists(work_dir): return False - # Look for the genai-perf result file + # Look for the aiperf result file result_files = glob.glob(result_file) if not result_files: return False @@ -65,7 +65,7 @@ def check_decode_results_exist( # Look for at least one decode result file result_pattern = ( - 
f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json" + f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json" ) result_files = glob.glob(result_pattern) @@ -93,7 +93,7 @@ def load_existing_prefill_results( ) -> Tuple[Optional[float], Optional[float]]: """Load existing prefill results from disk.""" work_dir = f"{output_dir}/prefill_tp{tp_size}" - result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json" + result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json" result_files = glob.glob(result_file) if result_files: @@ -115,7 +115,7 @@ def load_existing_decode_results( work_dir = f"{output_dir}/decode_tp{tp_size}" result_pattern = ( - f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json" + f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json" ) result_files = glob.glob(result_pattern) @@ -128,7 +128,7 @@ def load_existing_decode_results( thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size # Extract concurrency from filename - match = re.search(r"gap_request(\d+)_", result_file) + match = re.search(r"aiperf_request(\d+)_", result_file) if match: concurrency = int(match.group(1)) decode_results.append((itl, thpt_per_gpu, concurrency)) diff --git a/benchmarks/profiler/utils/profile_decode.py b/benchmarks/profiler/utils/profile_decode.py index 0953e5a7ccbc..1a9cbf3d96fa 100644 --- a/benchmarks/profiler/utils/profile_decode.py +++ b/benchmarks/profiler/utils/profile_decode.py @@ -6,9 +6,9 @@ import numpy as np +from benchmarks.profiler.utils.aiperf import benchmark_decode from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator -from benchmarks.profiler.utils.genai_perf import benchmark_decode from benchmarks.profiler.utils.plot import plot_decode_3d_surface logger = logging.getLogger(__name__) @@ -113,19 +113,21 @@ def 
profile_decode( attention_dp_size, ): def get_itl_and_thpt_per_gpu(isl, osl, num_request): - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - gap_result = benchmark_decode( + ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}_osl{osl}_n{num_request}" + aiperf_result = benchmark_decode( isl, osl, num_request, - genai_perf_artifact_dir, + ai_perf_artifact_dir, model_name, tokenizer, base_url=url, ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / num_gpus + if aiperf_result is not None: + itl = aiperf_result["records"]["inter_token_latency"]["avg"] + thpt_per_gpu = ( + aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus + ) return itl, thpt_per_gpu return None, None diff --git a/benchmarks/profiler/utils/profile_prefill.py b/benchmarks/profiler/utils/profile_prefill.py index 1be07ee0bfc1..d7f5dae91bf0 100644 --- a/benchmarks/profiler/utils/profile_prefill.py +++ b/benchmarks/profiler/utils/profile_prefill.py @@ -6,8 +6,8 @@ import numpy as np +from benchmarks.profiler.utils.aiperf import benchmark_prefill from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator -from benchmarks.profiler.utils.genai_perf import benchmark_prefill from benchmarks.profiler.utils.plot import plot_prefill_interpolation logger = logging.getLogger(__name__) @@ -81,16 +81,16 @@ def profile_prefill( interpolation_granularity, ): def get_ttft(isl): - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( + ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}" + aiperf_result = benchmark_prefill( isl, - genai_perf_artifact_dir, + ai_perf_artifact_dir, model_name, tokenizer, base_url=url, ) - if gap_result is not None: - return gap_result["time_to_first_token"]["avg"] + if aiperf_result is not None: + return aiperf_result["records"]["ttft"]["avg"] return None return _profile_prefill_helper( diff --git 
a/benchmarks/utils/genai.py b/benchmarks/utils/genai.py index 54b141a85be7..3e46f6e194ce 100644 --- a/benchmarks/utils/genai.py +++ b/benchmarks/utils/genai.py @@ -33,7 +33,7 @@ def get_concurrency_levels() -> List[int]: CONCURRENCIES: List[int] = get_concurrency_levels() -def run_genai_perf( +def run_aiperf( service_url: str, model_name: str, isl: int, @@ -44,7 +44,7 @@ def run_genai_perf( ) -> None: output_dir.mkdir(parents=True, exist_ok=True) cmd = [ - "genai-perf", + "aiperf", "profile", "-m", model_name, @@ -76,28 +76,28 @@ def run_genai_perf( "--max-threads=300", ] print( - f"Running genai-perf with isl {isl}, osl {osl}, concurrency {concurrency}", + f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}", flush=True, ) - gap_process = subprocess.Popen( + aip_process = subprocess.Popen( cmd, cwd=str(output_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) - stdout, stderr = gap_process.communicate() - if gap_process.returncode == 0: - print("Genai-perf profiling completed successfully", flush=True) + stdout, stderr = aip_process.communicate() + if aip_process.returncode == 0: + print("Aiperf profiling completed successfully", flush=True) if stdout: print(stdout) else: - print(f"Genai-perf failed with error code: {gap_process.returncode}") + print(f"Aiperf failed with error code: {aip_process.returncode}") if stderr: print(f"stderr: {stderr}") raise subprocess.CalledProcessError( - gap_process.returncode, cmd, output=stdout, stderr=stderr + aip_process.returncode, cmd, output=stdout, stderr=stderr ) @@ -113,6 +113,4 @@ def run_concurrency_sweep( for c in concurrency_levels: print(f"Starting concurrency level {c}", flush=True) - run_genai_perf( - service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}" - ) + run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}") diff --git a/benchmarks/utils/plot.py b/benchmarks/utils/plot.py index 4af27d354886..8ce1151ec420 100644 --- a/benchmarks/utils/plot.py 
+++ b/benchmarks/utils/plot.py @@ -32,22 +32,22 @@ def parse_benchmark_results(result_dir: Path) -> List[Tuple[int, Dict]]: continue concurrency = int(match.group(1)) - # Find the genai-perf JSON file - genai_perf_json = None - for json_file in concurrency_dir.rglob("profile_export_genai_perf.json"): - genai_perf_json = json_file + # Find the aiperf JSON file + aiperf_json = None + for json_file in concurrency_dir.rglob("profile_export_aiperf.json"): + aiperf_json = json_file break - if genai_perf_json and genai_perf_json.exists(): + if aiperf_json and aiperf_json.exists(): try: - with open(genai_perf_json, "r") as f: + with open(aiperf_json, "r") as f: metrics = json.load(f) results.append((concurrency, metrics)) print(f"Loaded metrics for concurrency {concurrency}") except Exception as e: - print(f"Error loading {genai_perf_json}: {e}") + print(f"Error loading {aiperf_json}: {e}") else: - print(f"Warning: No genai-perf JSON found for {concurrency_dir}") + print(f"Warning: No aiperf JSON found for {concurrency_dir}") # Sort by concurrency level results.sort(key=lambda x: x[0]) diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index b8eec3eb11e7..7d8072cf5c9f 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -64,6 +64,7 @@ RUN apt-get update -y \ # Python runtime - CRITICAL for virtual environment to work python${PYTHON_VERSION}-dev \ build-essential \ + git \ # SGLang build dependencies cmake \ ibverbs-providers \ @@ -147,6 +148,7 @@ RUN apt-get update && \ build-essential \ # jq and curl for polling various endpoints and health checks jq \ + git \ curl \ # Libraries required by UCX to find RDMA devices libibverbs1 rdma-core ibverbs-utils libibumad3 \ diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index 3ce16e62e109..b81795bd1445 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -17,7 +17,7 @@ ARG CARGO_BUILD_JOBS="16" RUN apt-get update 
-y && \ apt-get install -y \ cmake meson ninja-build pybind11-dev patchelf net-tools \ - build-essential protobuf-compiler libssl-dev pkg-config \ + build-essential protobuf-compiler libssl-dev pkg-config git \ clang libclang-dev git rapidjson-dev zlib1g-dev jq && \ pip install --break-system-packages meson-python wheel build @@ -128,7 +128,7 @@ RUN git clone --depth=1 \ cmake --build perf_analyzer/build -- -j$(nproc) ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH -RUN pip install --break-system-packages genai-perf +RUN pip install --break-system-packages aiperf # Enable forceful shutdown of inflight requests ENV SGL_FORCE_SHUTDOWN=1 diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 2d4f6e0d2f43..273d746d82ce 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -76,6 +76,7 @@ RUN apt-get update && \ build-essential \ g++ \ ninja-build \ + git \ # Python runtime - CRITICAL for virtual environment to work python${PYTHON_VERSION}-dev \ python3-pip \ diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index ff1646169244..bb0416ac07db 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -187,6 +187,7 @@ RUN apt-get update && \ build-essential \ # jq and curl for polling various endpoints and health checks jq \ + git \ curl \ # Libraries required by UCX to find RDMA devices libibverbs1 rdma-core ibverbs-utils libibumad3 \ diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index 696bc4632391..04082f67c70b 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 accelerate==1.6.0 +aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@e46d9089ffe4f5dd62c46914489c55b6dfdbc903 aiofiles +aiperf @ git+https://github.com/ai-dynamo/aiperf.git@e8f69abf180ff9ea96de9f9a8c955df8c024625b av==15.0.0 fastapi==0.115.12 ftfy @@ -15,7 +17,6 
@@ kubernetes_asyncio matplotlib msgspec mypy -numpy==1.26.4 # pmdarima is not compatible with numpy 2 nvidia-ml-py==13.580.65 opentelemetry-api opentelemetry-sdk @@ -26,7 +27,7 @@ prometheus-api-client prometheus_client prophet protobuf==5.29.5 -pydantic==2.10.6 +pydantic>=2.10.6 pyright PyYAML scikit-learn diff --git a/docs/benchmarks/benchmarking.md b/docs/benchmarks/benchmarking.md index 1f9e154335cd..8aebb9780fb6 100644 --- a/docs/benchmarks/benchmarking.md +++ b/docs/benchmarks/benchmarking.md @@ -61,7 +61,7 @@ Just quick testing/comparison? Client-side. ## What This Tool Does -The framework is a Python-based wrapper around `genai-perf` that: +The framework is a Python-based wrapper around `aiperf` that: - Benchmarks any HTTP endpoints - Runs concurrency sweeps across configurable load levels - Generates comparison plots with your custom labels @@ -70,7 +70,7 @@ The framework is a Python-based wrapper around `genai-perf` that: **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`) -**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s). +**Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s). 
--- @@ -165,7 +165,7 @@ REQUIRED: OPTIONS: -h, --help Show help message and examples - -m, --model MODEL Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B) + -m, --model MODEL Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B) NOTE: This must match the model deployed at the endpoint -i, --isl LENGTH Input sequence length (default: 2000) -s, --std STDDEV Input sequence standard deviation (default: 10) @@ -179,14 +179,14 @@ OPTIONS: - **Benchmark Name**: The benchmark name becomes the label in plots and results - **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved. - **Port-Forwarding**: You must have an exposed endpoint before benchmarking -- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, and must match the model deployed at the endpoint +- **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint - **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately ### What Happens During Benchmarking The Python benchmarking module: 1. **Connects** to your port-forwarded endpoint -2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250) +2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250) 3. **Measures** key metrics: latency, throughput, time-to-first-token 4. 
**Saves** results to an output directory organized by benchmark name @@ -301,9 +301,9 @@ results/ ``` Each concurrency directory contains: -- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf -- **`profile_export_genai_perf.csv`** - CSV format metrics from GenAI-Perf -- **`profile_export.json`** - Raw GenAI-Perf results +- **`profile_export_aiperf.json`** - Structured metrics from AIPerf +- **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf +- **`profile_export.json`** - Raw AIPerf results - **`inputs.json`** - Generated test inputs --- @@ -516,7 +516,7 @@ kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE" ## Customize Benchmarking Behavior -The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior: +The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior: 1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index a47f46ceaea8..b750eb709c64 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -38,7 +38,7 @@ spec: mkdir -p "$ARTIFACT_DIR" echo "Running benchmark..." export COLUMNS=200 - genai-perf profile \ + aiperf profile \ --model "$TARGET_MODEL" \ --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --url "$ENDPOINT" --streaming \ @@ -58,10 +58,10 @@ spec: --num-dataset-entries=3000 -- \ --max-threads 64 echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json) + PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) cat $PERF_JSON | jq . 
echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv) + PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) cat $PERF_CSV echo "Benchmark completed successfully!" volumeMounts: diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml index df21b60cf261..aa8eea8544cb 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml @@ -38,7 +38,7 @@ spec: mkdir -p "$ARTIFACT_DIR" echo "Running benchmark..." export COLUMNS=200 - genai-perf profile \ + aiperf profile \ --model "$TARGET_MODEL" \ --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --url "$ENDPOINT" --streaming \ @@ -58,10 +58,10 @@ spec: --num-dataset-entries=3000 -- \ --max-threads 64 echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json) + PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) cat $PERF_JSON | jq . echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv) + PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) cat $PERF_CSV echo "Benchmark completed successfully!" volumeMounts: diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml index 89a69ed0d1cf..d0f1f2dcb300 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml @@ -38,7 +38,7 @@ spec: mkdir -p "$ARTIFACT_DIR" echo "Running benchmark..." 
export COLUMNS=200 - genai-perf profile \ + aiperf profile \ --model "$TARGET_MODEL" \ --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --url "$ENDPOINT" --streaming \ @@ -58,10 +58,10 @@ spec: --num-dataset-entries=3000 -- \ --max-threads 64 echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json) + PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) cat $PERF_JSON | jq . echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv) + PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) cat $PERF_CSV echo "Benchmark completed successfully!" volumeMounts: From c4b41fd44e0698712e6f6e58fc77aa3b90d82fe1 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 14 Oct 2025 15:26:00 -0700 Subject: [PATCH 05/26] feat: add pre-deployment check for storageclass (#3573) (#3608) Signed-off-by: Harrison Saturley-Hall Co-authored-by: Harrison Saturley-Hall --- benchmarks/nixl/README.md | 32 -- deploy/cloud/pre-deployment/README.md | 172 ++++++++ deploy/cloud/pre-deployment/nixl/README.md | 297 +++++++++++++ .../pre-deployment/nixl/build_and_deploy.sh | 413 ++++++++++++++++++ .../nixl/nixlbench-deployment.yaml | 18 +- .../pre-deployment/pre-deployment-check.sh | 283 ++++++++++++ docs/kubernetes/README.md | 5 + 7 files changed, 1182 insertions(+), 38 deletions(-) delete mode 100644 benchmarks/nixl/README.md create mode 100644 deploy/cloud/pre-deployment/README.md create mode 100644 deploy/cloud/pre-deployment/nixl/README.md create mode 100755 deploy/cloud/pre-deployment/nixl/build_and_deploy.sh rename benchmarks/nixl/nixl-benchmark-deployment.yaml => deploy/cloud/pre-deployment/nixl/nixlbench-deployment.yaml (54%) create mode 100755 deploy/cloud/pre-deployment/pre-deployment-check.sh diff --git a/benchmarks/nixl/README.md 
b/benchmarks/nixl/README.md deleted file mode 100644 index 071992e4dc8a..000000000000 --- a/benchmarks/nixl/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# NIXL Benchmark Technical Documentation (Kubernetes) - -This guide describes how to run the NIXL benchmark using the provided Docker image on a Kubernetes (K8s) cluster. - ---- - -## Prerequisites - -- A running Kubernetes cluster with access to NVIDIA GPUs (e.g., using NVIDIA GPU Operator or device plugin) -- `kubectl` configured to access your cluster -- deploy dynamo cloud in a namespace - ---- - -## 1. Prepare the Kubernetes Deployment - -A sample deployment YAML is provided in this repository: -`benchmarks/nixl/nixl-benchmark-deployment.yaml` - -Update the image field in sample yaml to appropiate image in your registry. - -You can use the `yq` tool to update the image field in the deployment YAML -```bash -yq -i '.spec.template.spec.containers[] |= select(.name == "nixl-benchmark") .image = "your-registry/your-nixl-benchmark:your-tag"' benchmarks/nixl/nixl-benchmark-deployment.yaml > nixl-benchmark-deployment.yaml -``` - -## 2. Deploy using kubectl -Launch using the command below: - -```bash -kubectl apply -f nixl-benchmark-deployment.yaml -``` \ No newline at end of file diff --git a/deploy/cloud/pre-deployment/README.md b/deploy/cloud/pre-deployment/README.md new file mode 100644 index 000000000000..9bcb79e589ff --- /dev/null +++ b/deploy/cloud/pre-deployment/README.md @@ -0,0 +1,172 @@ + + +# Pre-Deployment Check Script + +This directory contains a pre-deployment check script that verifies your Kubernetes cluster meets the requirements for deploying Dynamo. + +- For NCCL tests, please refer to the [NCCL tests](https://docs.nebius.com/kubernetes/gpu/nccl-test#run-tests) for more details. + +- For NIXL benchmark, please refer to the [NIXL benchmark pre-deployment checks](/deploy/cloud/pre-deployment/nixl/README.md) for more details. 
+ +## Usage + +Run the pre-deployment check before deploying Dynamo: + +```bash +./pre-deployment-check.sh +``` + +## What it checks + +The script performs a few checks and provides a detailed summary: + +### 1. kubectl Connectivity +- Verifies that `kubectl` is installed and can connect to your Kubernetes cluster + +### 2. Default StorageClass +- Verifies that a default StorageClass is configured in your cluster +- If no default StorageClass is found: + - Lists all available StorageClasses in the cluster with full details + - Provides a sample command to set a StorageClass as default + - References the official Kubernetes documentation for detailed guidance + +### 3. Cluster GPU Resources +- Checks for GPU-enabled nodes in the cluster using label `nvidia.com/gpu.present=true` + +## Sample Output + +### Complete Script Output Example: +``` +======================================== + Dynamo Pre-Deployment Check Script +======================================== + +--- Checking kubectl connectivity --- +✅ kubectl is available and cluster is accessible + +--- Checking for default StorageClass --- +❌ No default StorageClass found + +Dynamo requires a default StorageClass for persistent volume provisioning. +Please configure a default StorageClass before proceeding with deployment. 
+ +Available StorageClasses in your cluster: +NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE +my-default-storage-class (default) compute.csi.mock Delete WaitForFirstConsumer true 65d +fast-ssd-storage kubernetes.io/gce-pd Delete Immediate true 30d + +To set a StorageClass as default, use the following command: +kubectl patch storageclass -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' + +Example with your first available StorageClass: +kubectl patch storageclass my-default-storage-class -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' + +For more information on managing default StorageClasses, visit: +https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/ + +--- Checking cluster gpu resources --- +✅ Found 17 gpu node(s) in the cluster +Node information: + +--- Pre-Deployment Check Summary --- +✅ kubectl Connectivity: PASSED +❌ Default StorageClass: FAILED +✅ Cluster Resources: PASSED + +Summary: 2 passed, 1 failed +❌ 1 pre-deployment check(s) failed. +Please address the issues above before proceeding with deployment. +``` + +### When all checks pass: +``` +======================================== + Dynamo Pre-Deployment Check Script +======================================== + + +--- Checking kubectl connectivity --- +✅ kubectl is available and cluster is accessible + +--- Checking for default StorageClass --- +✅ Default StorageClass found + - NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE +my-default-storage-class (default) compute.csi.mock Delete WaitForFirstConsumer true 65d + +--- Checking cluster gpu resources --- +✅ Found 17 gpu node(s) in the cluster +Node information: + + +--- Pre-Deployment Check Summary --- +✅ kubectl Connectivity: PASSED +✅ Default StorageClass: PASSED +✅ Cluster Resources: PASSED + +Summary: 3 passed, 0 failed +🎉 All pre-deployment checks passed! 
+Your cluster is ready for Dynamo deployment. +``` + +## Check Status Summary + +The script provides a comprehensive summary showing the status of each check: + +| Check Name | Description | Pass/Fail Status | +|------------|-------------|------------------| +| **kubectl Connectivity** | Verifies kubectl installation and cluster access | ✅ PASSED / ❌ FAILED | +| **Default StorageClass** | Checks for default StorageClass annotation | ✅ PASSED / ❌ FAILED | +| **Cluster Resources** | Validates GPU nodes availability | ✅ PASSED / ❌ FAILED | + +## Setting a Default StorageClass + +If you need to set a default StorageClass, use the following command: + +```bash +kubectl patch storageclass <storage-class-name> -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +``` + +Replace `<storage-class-name>` with the name of your desired StorageClass. + +## Troubleshooting + +### Multiple Default StorageClasses +If you have multiple StorageClasses marked as default, the script will warn you: +``` +⚠️ Warning: Multiple default StorageClasses detected + This may cause unpredictable behavior. Consider having only one default StorageClass. +``` + +To remove the default annotation from a StorageClass: +```bash +kubectl patch storageclass <storage-class-name> -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' +``` + +### No GPU Nodes Found +If no GPU nodes are found, ensure your cluster has nodes with the `nvidia.com/gpu.present=true` label. + +### No StorageClasses Available +If no StorageClasses are available in your cluster, you'll need to: +1. Install a storage provisioner (e.g., for cloud providers, local storage, etc.) +2. Create appropriate StorageClass resources +3. 
Mark one as default + +## Reference + +For more information on managing default StorageClasses, visit: +[Kubernetes Documentation - Change the default StorageClass](https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/) \ No newline at end of file diff --git a/deploy/cloud/pre-deployment/nixl/README.md b/deploy/cloud/pre-deployment/nixl/README.md new file mode 100644 index 000000000000..567940ff2867 --- /dev/null +++ b/deploy/cloud/pre-deployment/nixl/README.md @@ -0,0 +1,297 @@ + + +# NIXL Benchmark Documentation + +This guide describes how to build and deploy the NIXL benchmark using the provided scripts on a Kubernetes (K8s) cluster. + +> **Note**: NIXL benchmark is part of the Dynamo platform. Before proceeding, ensure your cluster meets the basic Dynamo requirements by running the pre-deployment check script located in the parent directory (`../pre-deployment-check.sh`). + +--- + +## Prerequisites + +### Cluster Requirements +Before deploying NIXL benchmark, ensure your cluster meets the Dynamo platform requirements by running the pre-deployment check: + +```bash +# Run from the parent directory +../pre-deployment-check.sh +``` + +This script verifies: +- `kubectl` connectivity and cluster access +- GPU nodes availability (`nvidia.com/gpu.present=true` label) +- GPU Operator installation and status + +### NIXL-Specific Requirements +In addition to the cluster requirements above, NIXL benchmark requires: +- **Docker** installed and configured on your local machine (for building images) +- **Docker registry access** to push the built nixlbench images +- **ETCD service** deployed and accessible as `etcd:2379` +- **Build utilities**: `wget` and `unzip` for downloading NIXL source code + +### Verification Steps +1. **Run pre-deployment check** (recommended): + ```bash + ../pre-deployment-check.sh + ``` + Ensure all checks pass before proceeding. + +2. 
**Verify ETCD availability** (NIXL-specific): + ```bash + kubectl get svc etcd + ``` + +3. **Confirm Docker registry access**: + ```bash + docker login your-registry.com # if using private registry + ``` + +--- + +## Quick Start + +For the easiest experience, use the interactive build and deploy script: + +```bash +./build_and_deploy.sh +``` + +This script provides a flexible workflow where you can: +1. **Select architecture**: Choose between x86_64 (Intel/AMD 64-bit) or aarch64 (ARM64) +2. **Choose which steps to execute**: Select any combination of: + - Build nixlbench Docker image + - Update deployment YAML file + - Deploy to Kubernetes +3. **Provide Docker registry** (only when needed for building or updating deployment) + +--- + +## Interactive Script Features + +### Architecture Selection +The script supports two architectures: +- **Option 1**: x86_64 (Intel/AMD 64-bit) +- **Option 2**: aarch64 (ARM64) + +You can select by entering: +- `1` or `x86_64` for x86_64 architecture +- `2` or `aarch64` for aarch64 architecture + +### Step Selection +Choose which steps to execute by entering comma-separated numbers: + +- **All steps**: `1,2,3` +- **Build and update only**: `1,2` (skips Kubernetes deployment) +- **Deploy only**: `3` (useful if image is already built and deployment file exists) +- **Build only**: `1` (useful for just creating the Docker image) +- **Update deployment only**: `2` (useful for updating deployment file with new registry/version) + +### Smart Registry Prompting +The script only prompts for Docker registry information when needed: +- **Steps 1 or 2**: Registry required for building image or updating deployment +- **Step 3 only**: No registry prompt (uses existing deployment file) + +--- + +## What Each Step Does + +### Step 1: Build nixlbench Docker Image +- Downloads NIXL source code (version 0.6.0) from GitHub +- Extracts and navigates to the build directory +- Pauses for user confirmation before building +- Builds Docker image with 
specified registry and architecture +- Tags image as: `{registry}/nixlbench:0.6.0-{arch}` + +### Step 2: Update Deployment YAML File +- Copies base deployment template (`nixlbench-deployment.yaml`) +- Creates architecture-specific deployment file (`nixlbench-deployment-{arch}.yaml`) +- Updates image reference with your registry and architecture +- Preserves all other deployment configurations + +### Step 3: Deploy to Kubernetes +- Validates deployment file exists +- Applies deployment to Kubernetes cluster +- Provides monitoring commands for checking status + +--- + +## Deployment Configuration + +The deployment creates: +- **2 replicas** of the nixlbench pod +- **Resource requests/limits**: + - CPU: 10 cores + - Memory: 5Gi + - GPU: 1 NVIDIA GPU per pod +- **Environment variables**: + - `ETCD_ENDPOINTS`: Points to `etcd:2379` +- **Command**: Runs nixlbench with VRAM segments and keeps container alive + +--- + +## Usage Examples + +### Example 1: Complete Workflow +```bash +./build_and_deploy.sh +# Select: 1 (x86_64) +# Steps: 1,2,3 +# Registry: docker.io/myusername +# Confirm: y +``` + +### Example 2: Build Image Only +```bash +./build_and_deploy.sh +# Select: 2 (aarch64) +# Steps: 1 +# Registry: my-private-registry.com +# Confirm: y +``` + +### Example 3: Deploy Existing Image +```bash +./build_and_deploy.sh +# Select: 1 (x86_64) +# Steps: 3 +# Confirm: y +``` + +### Example 4: Update Deployment File Only +```bash +./build_and_deploy.sh +# Select: 2 (aarch64) +# Steps: 2 +# Registry: new-registry.com +# Confirm: y +``` + +--- + +## Generated Files + +The script creates architecture-specific deployment files: +- `nixlbench-deployment-x86_64.yaml` - For x86_64 builds +- `nixlbench-deployment-aarch64.yaml` - For aarch64 builds + +These files are customized versions of the base template with your specific: +- Docker registry +- Image tag +- Architecture + +--- + +## Monitoring Your Deployment + +After deployment, monitor your NIXL benchmark: + +```bash +# Check pod 
status +kubectl get pods -l app=nixl-benchmark + +# View logs +kubectl logs -l app=nixl-benchmark -f + +# Check resource usage +kubectl top pods -l app=nixl-benchmark + +# Get detailed pod information +kubectl describe pods -l app=nixl-benchmark +``` + +If deployed to a specific namespace: +```bash +kubectl get pods -l app=nixl-benchmark -n your-namespace +kubectl logs -l app=nixl-benchmark -f -n your-namespace +``` + +--- + + +## Troubleshooting + +### Cluster-Level Issues +For cluster-related problems, first run the pre-deployment check to identify issues: + +```bash +../pre-deployment-check.sh +``` + +This will help diagnose: +- kubectl connectivity problems +- Missing default StorageClass +- GPU node availability issues +- GPU Operator status problems + +### NIXL-Specific Issues + +1. **ETCD Connection**: + - Ensure the etcd service referenced by `ETCD_ENDPOINTS` (`etcd:2379` in the deployment template) is running: `kubectl get svc etcd` + - Verify etcd endpoints are accessible from pods + - Check if etcd is in the correct namespace + +2. **Image Pull Issues**: + - Verify registry credentials are configured + - Check image exists: `docker pull {registry}/nixlbench:0.6.0-{arch}` + - Ensure image was pushed successfully after build + +3. **Build Failures**: + - Ensure Docker daemon is running + - Check available disk space in `/tmp` + - Verify network connectivity to GitHub + - Confirm build utilities are installed: `which wget unzip` + +4. 
**Deployment File Not Found**: + - Run step 2 to create deployment file before step 3 + - Check file permissions in script directory + - Verify script directory path is correct + +### Debug Commands +```bash +# Check script-generated files +ls -la nixlbench-deployment-*.yaml + +# Verify deployment status +kubectl get deployment nixl-benchmark -o yaml + +# Check events for issues +kubectl get events --sort-by=.metadata.creationTimestamp +``` + +### Cleanup + +To remove the deployment: +```bash +kubectl delete deployment nixl-benchmark +``` + +Or if deployed to a specific namespace: +```bash +kubectl delete deployment nixl-benchmark -n your-namespace +``` + +To clean up generated files: +```bash +rm -f nixlbench-deployment-*.yaml +``` + +--- + +## Script Reference + +### build_and_deploy.sh +Interactive script that provides flexible build and deployment workflow: +- **Architecture selection**: x86_64 or aarch64 +- **Step selection**: Choose any combination of build, update, deploy +- **Validation**: Checks for deployment files before deploying + +### nixlbench-deployment.yaml +Base Kubernetes deployment template that gets customized by the script: +- **Template image**: `my-registry/nixlbench:version-arch` +- **Resource allocation**: 10 CPU, 5Gi memory, 1 GPU per pod +- **ETCD integration**: Pre-configured environment variables +- **Benchmark command**: Runs with VRAM segment configuration \ No newline at end of file diff --git a/deploy/cloud/pre-deployment/nixl/build_and_deploy.sh b/deploy/cloud/pre-deployment/nixl/build_and_deploy.sh new file mode 100755 index 000000000000..88f966a61bad --- /dev/null +++ b/deploy/cloud/pre-deployment/nixl/build_and_deploy.sh @@ -0,0 +1,413 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + + +NIXL_VERSION="0.6.0" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Function to check Docker daemon status +check_docker_daemon() { + if ! docker info >/dev/null 2>&1; then + return 1 + fi + return 0 +} + +# Function to check all required dependencies +check_dependencies() { + echo "Checking required dependencies..." + local missing_deps=() + local warnings=() + + # Check wget + if ! command_exists wget; then + missing_deps+=("wget") + else + echo "✅ wget is available" + fi + + # Check unzip + if ! command_exists unzip; then + missing_deps+=("unzip") + else + echo "✅ unzip is available" + fi + + # Check kubectl + if ! command_exists kubectl; then + missing_deps+=("kubectl") + else + echo "✅ kubectl is available" + # Test kubectl connectivity + if ! kubectl cluster-info >/dev/null 2>&1; then + warnings+=("kubectl is installed but cannot connect to cluster") + else + echo "✅ kubectl can connect to cluster" + fi + fi + + # Check Docker + if ! command_exists docker; then + missing_deps+=("docker") + else + echo "✅ docker is available" + # Check Docker daemon + if ! check_docker_daemon; then + warnings+=("Docker is installed but daemon is not running or accessible") + else + echo "✅ Docker daemon is running" + + # Additional Docker toolchain checks + if ! docker ps >/dev/null 2>&1; then + warnings+=("Docker requires sudo or user is not in docker group - consider adding user to docker group") + fi + + if ! 
docker buildx version >/dev/null 2>&1; then + warnings+=("Docker buildx not available (may affect multi-architecture builds)") + fi + fi + fi + + # Report missing dependencies + if [ ${#missing_deps[@]} -gt 0 ]; then + echo + echo "❌ Missing required dependencies:" + for dep in "${missing_deps[@]}"; do + echo " - $dep" + done + echo + echo "Please install the missing dependencies and try again." + echo + echo "Installation suggestions:" + for dep in "${missing_deps[@]}"; do + case "$dep" in + wget) + echo " wget: sudo apt-get install wget (Ubuntu/Debian) or yum install wget (RHEL/CentOS)" + ;; + unzip) + echo " unzip: sudo apt-get install unzip (Ubuntu/Debian) or yum install unzip (RHEL/CentOS)" + ;; + kubectl) + echo " kubectl: https://kubernetes.io/docs/tasks/tools/install-kubectl/" + ;; + docker) + echo " docker: https://docs.docker.com/get-docker/" + ;; + esac + done + return 1 + fi + + # Report warnings + if [ ${#warnings[@]} -gt 0 ]; then + echo + echo "⚠️ Warnings:" + for warning in "${warnings[@]}"; do + echo " - $warning" + done + echo + printf "Do you want to continue despite these warnings? (y/N): " + read continue_with_warnings + case "$continue_with_warnings" in + [Yy]|[Yy][Ee][Ss]) + echo "Continuing with warnings..." + ;; + *) + echo "Please resolve the warnings and try again." 
+ return 1 + ;; + esac + fi + + echo "✅ All required dependencies are available" + return 0 +} + +# Function to display available architectures +show_architectures() { + echo "Available architectures:" + echo "1) x86_64 (Intel/AMD 64-bit)" + echo "2) aarch64 (ARM64)" +} + +# Function to validate architecture input +validate_architecture() { + local arch=$1 + case $arch in + 1|x86_64) + echo "x86_64" + return 0 + ;; + 2|aarch64) + echo "aarch64" + return 0 + ;; + *) + return 1 + ;; + esac +} + +# Function to prompt for registry +prompt_for_registry() { + echo + printf "Enter your Docker registry (e.g., my-registry, docker.io/username): " + read REGISTRY + if [ -z "$REGISTRY" ]; then + echo "Error: Registry cannot be empty" + exit 1 + fi +} + +# Function to build nixlbench image +build_nixlbench() { + local arch=$1 + local registry=$2 + + echo "Building nixlbench image for architecture: $arch" + echo "Registry: $registry" + + NIXL_BUILD_DIR="/tmp/nixlbench-${NIXL_VERSION}" + rm -rf "${NIXL_BUILD_DIR}" + mkdir -p "${NIXL_BUILD_DIR}" + cd "${NIXL_BUILD_DIR}" + + echo "Downloading NIXL source..." + wget https://github.com/ai-dynamo/nixl/archive/refs/tags/${NIXL_VERSION}.zip + unzip "${NIXL_VERSION}.zip" + cd "nixl-${NIXL_VERSION}/benchmark/nixlbench/contrib" + read -p "Press Enter to continue" + echo "Building Docker image..." + ./build.sh --tag "${registry}/nixlbench:${NIXL_VERSION}-${arch}" --arch "${arch}" + + echo "Build completed successfully!" 
+ echo "Image: ${registry}/nixlbench:${NIXL_VERSION}-${arch}" +} + +# Function to update deployment yaml +update_deployment() { + local arch=$1 + local registry=$2 + local deployment_file="${SCRIPT_DIR}/nixlbench-deployment-${arch}.yaml" + + echo "Creating deployment file: $deployment_file" + + # Copy the original deployment file and update the image + cp "${SCRIPT_DIR}/nixlbench-deployment.yaml" "$deployment_file" + + # Update the image field using sed + sed -i "s|my-registry/nixlbench:version-arch|${registry}/nixlbench:${NIXL_VERSION}-${arch}|g" "$deployment_file" + + echo "Deployment file updated with image: ${registry}/nixlbench:${NIXL_VERSION}-${arch}" +} + +# Function to prompt for steps to execute +prompt_for_steps() { + echo + echo "Select which steps to execute:" + echo "1) Build nixlbench Docker image" + echo "2) Update deployment YAML file" + echo "3) Deploy to Kubernetes" + echo + echo "Enter the steps you want to execute (e.g., '1,2,3' for all, '1,2' to skip deployment, '3' for deployment only):" + printf "Steps to execute: " + read steps_input + + if [ -z "$steps_input" ]; then + echo "Error: Please select at least one step" + return 1 + fi + + # Parse the input and set flags + EXECUTE_BUILD=false + EXECUTE_UPDATE=false + EXECUTE_DEPLOY=false + + # Convert comma-separated input to array + IFS=',' read -ra STEPS <<< "$steps_input" + for step in "${STEPS[@]}"; do + # Remove whitespace + step=$(echo "$step" | tr -d ' ') + case "$step" in + 1) + EXECUTE_BUILD=true + ;; + 2) + EXECUTE_UPDATE=true + ;; + 3) + EXECUTE_DEPLOY=true + ;; + *) + echo "Warning: Invalid step '$step' ignored. 
Valid steps are 1, 2, 3" + ;; + esac + done + + # Check if at least one valid step was selected + if [ "$EXECUTE_BUILD" = false ] && [ "$EXECUTE_UPDATE" = false ] && [ "$EXECUTE_DEPLOY" = false ]; then + echo "Error: No valid steps selected" + return 1 + fi + + return 0 +} + +# Function to deploy to Kubernetes +deploy_to_k8s() { + local arch=$1 + local deployment_file="${SCRIPT_DIR}/nixlbench-deployment-${arch}.yaml" + + echo "Deploying to Kubernetes..." + kubectl apply -f "$deployment_file" + echo "Deployment applied successfully!" + echo + echo "To check the status of your deployment:" + echo "kubectl get pods -l app=nixl-benchmark" + echo + echo "To view logs:" + echo "kubectl logs -l app=nixl-benchmark -f" +} + +# Main script +main() { + echo "NIXL Benchmark Build and Deploy Script" + echo "======================================" + echo + + # Check dependencies first + if ! check_dependencies; then + exit 1 + fi + echo + + # Show available architectures + show_architectures + echo + + # Prompt for architecture + while true; do + printf "Select architecture (1-2 or enter x86_64/aarch64): " + read arch_input + + if [ -z "$arch_input" ]; then + echo "Error: Please select an architecture" + continue + fi + + SELECTED_ARCH=$(validate_architecture "$arch_input") + if [ $? -eq 0 ]; then + break + else + echo "Error: Invalid architecture. Please select 1, 2, x86_64, or aarch64" + fi + done + + echo "Selected architecture: $SELECTED_ARCH" + + # Prompt for registry (only if building or updating deployment) + REGISTRY="" + + # Prompt for steps to execute + while true; do + if prompt_for_steps; then + break + fi + echo "Please try again." 
+ echo + done + + # Only prompt for registry if we need it + if [ "$EXECUTE_BUILD" = true ] || [ "$EXECUTE_UPDATE" = true ]; then + prompt_for_registry + fi + + echo + echo "Summary:" + echo "- Architecture: $SELECTED_ARCH" + if [ -n "$REGISTRY" ]; then + echo "- Registry: $REGISTRY" + echo "- Image will be: $REGISTRY/nixlbench:$NIXL_VERSION-$SELECTED_ARCH" + fi + echo "- Steps to execute:" + if [ "$EXECUTE_BUILD" = true ]; then + echo " ✓ Build nixlbench Docker image" + else + echo " ✗ Build nixlbench Docker image (skipped)" + fi + if [ "$EXECUTE_UPDATE" = true ]; then + echo " ✓ Update deployment YAML file" + else + echo " ✗ Update deployment YAML file (skipped)" + fi + if [ "$EXECUTE_DEPLOY" = true ]; then + echo " ✓ Deploy to Kubernetes" + else + echo " ✗ Deploy to Kubernetes (skipped)" + fi + echo + + printf "Proceed with selected steps? (y/N): " + read confirm + case "$confirm" in + [Yy]|[Yy][Ee][Ss]) + ;; + *) + echo "Process cancelled." + exit 0 + ;; + esac + + # Execute selected steps + if [ "$EXECUTE_BUILD" = true ]; then + echo + echo "=== Building nixlbench Docker image ===" + build_nixlbench "$SELECTED_ARCH" "$REGISTRY" + fi + + if [ "$EXECUTE_UPDATE" = true ]; then + echo + echo "=== Updating deployment YAML file ===" + update_deployment "$SELECTED_ARCH" "$REGISTRY" + fi + + if [ "$EXECUTE_DEPLOY" = true ]; then + echo + echo "=== Deploying to Kubernetes ===" + # Check if deployment file exists + deployment_file="${SCRIPT_DIR}/nixlbench-deployment-${SELECTED_ARCH}.yaml" + if [ ! -f "$deployment_file" ]; then + echo "Warning: Deployment file not found at $deployment_file" + echo "You may need to run step 2 (Update deployment YAML file) first." + printf "Do you want to continue with deployment anyway? (y/N): " + read deploy_confirm + case "$deploy_confirm" in + [Yy]|[Yy][Ee][Ss]) + ;; + *) + echo "Deployment skipped." 
+ EXECUTE_DEPLOY=false + ;; + esac + fi + + if [ "$EXECUTE_DEPLOY" = true ]; then + deploy_to_k8s "$SELECTED_ARCH" + fi + fi + + echo + echo "Process completed successfully!" +} + +# Run main function +main "$@" diff --git a/benchmarks/nixl/nixl-benchmark-deployment.yaml b/deploy/cloud/pre-deployment/nixl/nixlbench-deployment.yaml similarity index 54% rename from benchmarks/nixl/nixl-benchmark-deployment.yaml rename to deploy/cloud/pre-deployment/nixl/nixlbench-deployment.yaml index b0bf1084ac20..15cd39431555 100644 --- a/benchmarks/nixl/nixl-benchmark-deployment.yaml +++ b/deploy/cloud/pre-deployment/nixl/nixlbench-deployment.yaml @@ -14,16 +14,22 @@ spec: labels: app: nixl-benchmark spec: - imagePullSecrets: - - name: nvcr-imagepullsecret containers: - name: nixl-benchmark - image: my-registry/vllm-runtime:nixlbench-e42c07a8 + image: "my-registry/nixlbench:version-arch" command: ["sh", "-c"] + env: + - name: ETCD_ENDPOINTS + value: etcd:2379 args: - - "nixlbench -etcd_endpoints http://dynamo-platform-etcd:2379 --target_seg_type VRAM --initiator_seg_type VRAM && sleep infinity" + - | + nixlbench -etcd_endpoints ${ETCD_ENDPOINTS} --target_seg_type VRAM --initiator_seg_type VRAM && sleep infinity resources: requests: - nvidia.com/gpu: "1" + cpu: "10" + memory: "5Gi" + nvidia.com/gpu: "1" limits: - nvidia.com/gpu: "1" + cpu: "10" + memory: "5Gi" + nvidia.com/gpu: "1" diff --git a/deploy/cloud/pre-deployment/pre-deployment-check.sh b/deploy/cloud/pre-deployment/pre-deployment-check.sh new file mode 100755 index 000000000000..3477718b1998 --- /dev/null +++ b/deploy/cloud/pre-deployment/pre-deployment-check.sh @@ -0,0 +1,283 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Pre-deployment check script for Dynamo +# This script verifies that the Kubernetes cluster has the necessary prerequisites +# before deploying Dynamo platform. 
+# +# Checks performed: +# 1. kubectl connectivity - Verifies kubectl is installed and can connect to cluster +# 2. Default StorageClass - Ensures a default StorageClass is configured +# 3. Cluster GPU Resources - Validates GPU nodes are available +# 4. GPU Operator - Confirms GPU operator is installed and running + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +print_header() { + echo -e "\n${BLUE}========================================${NC}" + echo -e "${BLUE} Dynamo Pre-Deployment Check Script ${NC}" + echo -e "${BLUE}========================================${NC}\n" +} + +print_section() { + echo -e "\n${BLUE}--- $1 ---${NC}" +} + +# Function to check if kubectl is available and cluster is accessible +check_kubectl() { + print_section "Checking kubectl connectivity" + + if ! command -v kubectl &> /dev/null; then + print_status $RED "❌ kubectl is not installed or not in PATH" + print_status $YELLOW "Please install kubectl and ensure it's in your PATH" + return 1 + fi + + if ! 
kubectl cluster-info &> /dev/null; then + print_status $RED "❌ Cannot connect to Kubernetes cluster" + print_status $YELLOW "Please ensure kubectl is configured to connect to your cluster" + return 1 + fi + + print_status $GREEN "✅ kubectl is available and cluster is accessible" + return 0 +} + +# Function to check for default storage class +check_default_storage_class() { + print_section "Checking for default StorageClass" + + # Use JSONPath to find storage classes with the default annotation set to "true" + local default_storage_classes + default_storage_classes=$(kubectl get storageclass -o jsonpath='{range .items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")]}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [[ -z "$default_storage_classes" ]]; then + print_status $RED "❌ No default StorageClass found" + print_status $YELLOW "\nDynamo requires a default StorageClass for persistent volume provisioning." + print_status $BLUE "Please follow the instructions below to configure a default StorageClass before proceeding with deployment.\n" + + # Show available storage classes + print_status $BLUE "Available StorageClasses in your cluster:" + local all_storage_classes + all_storage_classes=$(kubectl get storageclass 2>/dev/null || echo "") + + if [[ -z "$all_storage_classes" ]]; then + print_status $YELLOW " No StorageClasses found in the cluster" + else + echo -e "$all_storage_classes" + + local all_storage_class_names + all_storage_class_names=$(kubectl get storageclass -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + print_status $BLUE "\nTo set a StorageClass as default, use the following command:" + print_status $YELLOW "kubectl patch storageclass -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'" + + if [[ -n "$all_storage_class_names" ]]; then + local first_sc_name + first_sc_name=$(echo "$all_storage_class_names" | head -n1) + 
print_status $BLUE "\nExample with your first available StorageClass:" + print_status $YELLOW "kubectl patch storageclass ${first_sc_name} -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'" + fi + fi + + print_status $BLUE "\nFor more information on managing default StorageClasses, visit:" + print_status $BLUE "https://kubernetes.io/docs/tasks/administer-cluster/change-default-storage-class/" + return 1 + else + print_status $GREEN "✅ Default StorageClass found" + while IFS= read -r sc_name; do + if [[ -n "$sc_name" ]]; then + local default_sc + default_sc=$(kubectl get storageclass "$sc_name" 2>/dev/null || echo "unknown") + print_status $GREEN " - ${default_sc}" + fi + done <<< "$default_storage_classes" + + # Check if there are multiple default storage classes (which can cause issues) + local default_count + default_count=$(echo "$default_storage_classes" | grep -c . || true) + if [[ $default_count -gt 1 ]]; then + print_status $YELLOW "⚠️ Warning: Multiple default StorageClasses detected" + print_status $YELLOW " This may cause unpredictable behavior. Consider having only one default StorageClass." + fi + return 0 + fi +} + +check_cluster_resources() { + print_section "Checking cluster GPU resources" + + local node_count + node_count=$(kubectl get nodes -l nvidia.com/gpu.present=true -o name 2>/dev/null | wc -l || echo "0") + + if [[ $node_count -eq 0 ]]; then + print_status $RED "❌ No GPU nodes found in the cluster" + print_status $YELLOW "Dynamo requires nodes with nvidia.com/gpu.present=true label." + print_status $BLUE "Please ensure your cluster has GPU-enabled nodes properly labeled."
+ return 1 + else + print_status $GREEN "✅ Found ${node_count} GPU node(s) in the cluster" + return 0 + fi + + # Show basic node information (commented out for cleaner output) + # print_status $BLUE "GPU Node information:" + # kubectl get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].type,ROLES:.metadata.labels.'node-role\.kubernetes\.io/.*',VERSION:.status.nodeInfo.kubeletVersion 2>/dev/null || true +} + +check_gpu_operator() { + print_section "Checking GPU operator" + + # Check if GPU operator pods exist and are running + local gpu_operator_pods + gpu_operator_pods=$(kubectl get pods -A -lapp=gpu-operator --no-headers 2>/dev/null || echo "") + + if [[ -z "$gpu_operator_pods" ]]; then + print_status $RED "❌ GPU operator not found in the cluster" + print_status $YELLOW "Dynamo requires GPU operator to be installed and running." + print_status $BLUE "Please install GPU operator before proceeding with deployment." + return 1 + fi + + # Count Running pods; grep -c already prints 0 (and exits 1) when nothing matches, so only swallow the exit status instead of echoing a second 0 + local running_pods + running_pods=$(echo "$gpu_operator_pods" | grep -c "Running" || true) + local total_pods + total_pods=$(echo "$gpu_operator_pods" | wc -l) + + if [[ $running_pods -eq 0 ]]; then + print_status $RED "❌ GPU operator pods are not running" + print_status $YELLOW "Found $total_pods GPU operator pod(s) but none are in Running state:" + echo "$gpu_operator_pods" + return 1 + elif [[ $running_pods -lt $total_pods ]]; then + print_status $YELLOW "⚠️ GPU operator partially running: $running_pods/$total_pods pods running" + echo "$gpu_operator_pods" + print_status $GREEN "✅ GPU operator is available (with warnings)" + return 0 + else + print_status $GREEN "✅ GPU operator is running ($running_pods/$total_pods pods)" + return 0 + fi +} + +# Global variables to track check results (kept as '|'-delimited strings rather than arrays, for compatibility) +CHECK_RESULTS="" +CHECK_ORDER="" + +# Function to record check result +record_check_result() { + local
check_name="$1" + local status="$2" + + # Append to results string with delimiter + if [[ -z "$CHECK_RESULTS" ]]; then + CHECK_RESULTS="${check_name}:${status}" + CHECK_ORDER="${check_name}" + else + CHECK_RESULTS="${CHECK_RESULTS}|${check_name}:${status}" + CHECK_ORDER="${CHECK_ORDER}|${check_name}" + fi +} + +# Function to get check result by name +get_check_result() { + local check_name="$1" + echo "$CHECK_RESULTS" | tr '|' '\n' | grep "^${check_name}:" | cut -d':' -f2 +} + +# Function to display check summary +display_check_summary() { + print_section "Pre-Deployment Check Summary" + + local passed=0 + local failed=0 + + # Split CHECK_ORDER by delimiter and iterate + IFS='|' read -ra CHECKS <<< "$CHECK_ORDER" + for check_name in "${CHECKS[@]}"; do + local status=$(get_check_result "$check_name") + if [[ "$status" == "PASS" ]]; then + print_status $GREEN "✅ $check_name: PASSED" + ((passed++)) + else + print_status $RED "❌ $check_name: FAILED" + ((failed++)) + fi + done + + echo "" + print_status $BLUE "Summary: $passed passed, $failed failed" + + if [[ $failed -eq 0 ]]; then + print_status $GREEN "🎉 All pre-deployment checks passed!" + print_status $GREEN "Your cluster is ready for Dynamo deployment." + return 0 + else + print_status $RED "❌ $failed pre-deployment check(s) failed." + print_status $RED "Please address the issues above before proceeding with deployment." 
+ return 1 + fi +} + +# Main execution +main() { + print_header + + local overall_exit_code=0 + + # Run checks and capture results + if check_kubectl; then + record_check_result "kubectl Connectivity" "PASS" + else + record_check_result "kubectl Connectivity" "FAIL" + overall_exit_code=1 + fi + + if check_default_storage_class; then + record_check_result "Default StorageClass" "PASS" + else + record_check_result "Default StorageClass" "FAIL" + overall_exit_code=1 + fi + + if check_cluster_resources; then + record_check_result "Cluster GPU Resources" "PASS" + else + record_check_result "Cluster GPU Resources" "FAIL" + overall_exit_code=1 + fi + + if check_gpu_operator; then + record_check_result "GPU Operator" "PASS" + else + record_check_result "GPU Operator" "FAIL" + overall_exit_code=1 + fi + + # Display summary + echo "" + if ! display_check_summary; then + overall_exit_code=1 + fi + + exit $overall_exit_code +} + +# Run the script +main "$@" diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index 5cbac1dc432d..c7ffb22d4b3d 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -19,6 +19,11 @@ limitations under the License. High-level guide to Dynamo Kubernetes deployments. Start here, then dive into specific guides. +## Pre-deployment Checks + +Before deploying the platform, it is recommended to run the pre-deployment checks to ensure the cluster is ready for deployment. Please refer to the [pre-deployment checks](/deploy/cloud/pre-deployment/README.md) for more details. + + ## 1. 
Install Platform First ```bash From 165276f5ce1821ceb5f82d49c8d42e3777acae5f Mon Sep 17 00:00:00 2001 From: ishandhanani <82981111+ishandhanani@users.noreply.github.com> Date: Wed, 15 Oct 2025 15:35:20 -0700 Subject: [PATCH 06/26] chore: update sglang container and version (#3647) --- .../sglang/request_handlers/handler_base.py | 2 +- container/Dockerfile.sglang | 2 +- container/Dockerfile.sglang-wideep | 156 ++++++------------ docs/backends/sglang/README.md | 148 ++++++++--------- docs/backends/sglang/dsr1-wideep-gb200.md | 14 +- docs/backends/sglang/dsr1-wideep-h100.md | 90 +++------- docs/backends/sglang/sgl-hicache-example.md | 2 +- docs/backends/sglang/sglang-disaggregation.md | 89 ++++++++++ docs/hidden_toctree.rst | 1 + pyproject.toml | 2 +- 10 files changed, 237 insertions(+), 269 deletions(-) create mode 100644 docs/backends/sglang/sglang-disaggregation.md diff --git a/components/src/dynamo/sglang/request_handlers/handler_base.py b/components/src/dynamo/sglang/request_handlers/handler_base.py index 9f48fcea4a87..4d4472e19a94 100644 --- a/components/src/dynamo/sglang/request_handlers/handler_base.py +++ b/components/src/dynamo/sglang/request_handlers/handler_base.py @@ -189,7 +189,7 @@ async def _cancellation_monitor( Yields: asyncio.Task: The cancellation monitoring task being managed """ - logging.info(f"Creating cancellation monitor task for Context: {context.id()}") + logging.debug(f"Creating cancellation monitor task for Context: {context.id()}") # Start the cancellation monitoring task cancellation_task = asyncio.create_task( diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index 7d8072cf5c9f..e892ffc50b9b 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -14,7 +14,7 @@ ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" # Make sure to update the dependency version in pyproject.toml when updating this -ARG SGLANG_VERSION="0.5.3" +ARG SGLANG_VERSION="0.5.3.post1" # 
Define general architecture ARGs for supporting both x86 and aarch64 builds. diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index b81795bd1445..3313929e1616 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -1,103 +1,68 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -ARG SGLANG_IMAGE_TAG="v0.5.3-cu126" +ARG SGLANG_IMAGE_TAG="v0.5.3.post1" +ARG BRANCH_TYPE -FROM lmsysorg/sglang:${SGLANG_IMAGE_TAG} - -ARG MODE="hopper" -ARG ARCH="amd64" -ARG ARCH_ALT="x86_64" -ARG NIXL_UCX_REF="v1.19.0" -ARG NIXL_TAG="0.5.0" -ARG CMAKE_VERSION="3.31.8" -ARG RUST_VERSION="1.90.0" -ARG CARGO_BUILD_JOBS="16" - -RUN apt-get update -y && \ - apt-get install -y \ - cmake meson ninja-build pybind11-dev patchelf net-tools \ - build-essential protobuf-compiler libssl-dev pkg-config git \ - clang libclang-dev git rapidjson-dev zlib1g-dev jq && \ - pip install --break-system-packages meson-python wheel build - -# Build UCX + NIXL for x86/hopper until its fully tested on GB200 -RUN if [ "$MODE" = "hopper" ]; then \ - apt-get install -y --no-install-recommends \ - libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ - libnuma-dev librdmacm-dev ibverbs-providers autoconf libtool && \ - # UCX from source - rm -rf /opt/hpcx/ucx /usr/local/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && git checkout $NIXL_UCX_REF && \ - ./autogen.sh && \ - ./configure \ - --prefix=/usr/local/ucx \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs \ - --with-efa \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --enable-mt && \ - make -j && make install-strip && ldconfig && \ - # NIXL - git clone https://github.com/ai-dynamo/nixl.git /opt/nixl && \ - cd /opt/nixl && 
git checkout $NIXL_TAG && \ - pip install --break-system-packages . \ - --config-settings="setup-args=-Ducx_path=/usr/local/ucx"; \ - fi +FROM scratch AS local_src +COPY . /src -ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH +FROM lmsysorg/sglang:${SGLANG_IMAGE_TAG} -# Dynamo WORKDIR /sgl-workspace -COPY . /sgl-workspace/dynamo - -ENV RUSTUP_HOME=/usr/local/rustup \ - CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH - -RUN wget --tries=3 --waitretry=5 \ - "https://static.rust-lang.org/rustup/archive/1.28.1/${ARCH_ALT}-unknown-linux-gnu/rustup-init" && \ - chmod +x rustup-init && \ - ./rustup-init -y \ - --no-modify-path \ - --profile minimal \ - --default-toolchain $RUST_VERSION \ - --default-host ${ARCH_ALT}-unknown-linux-gnu && \ - rm rustup-init && \ - chmod -R a+w $RUSTUP_HOME $CARGO_HOME -ARG CARGO_BUILD_JOBS -ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS} - -RUN cd dynamo && cargo build --release +# Install dynamo +# Providing --build-arg BRANCH_TYPE=local will editable install the local dynamo repo +# Providing --build-arg BRANCH_TYPE=remote will editable install the remote dynamo repo +# Default is to install the latest published dynamo version +ARG BRANCH_TYPE +COPY --from=local_src /src /tmp/local_src +RUN if [ "$BRANCH_TYPE" = "local" ]; then \ + cp -r /tmp/local_src /sgl-workspace/dynamo; \ + elif [ "$BRANCH_TYPE" = "remote" ]; then \ + git clone https://github.com/ai-dynamo/dynamo.git /sgl-workspace/dynamo; \ + fi -RUN cd dynamo/lib/bindings/python && \ +# SGLang does not use a venv in their container +RUN if [ "$BRANCH_TYPE" = "local" ]; then \ + cd dynamo/lib/bindings/python && \ + pip install --break-system-packages maturin && \ + maturin build --release && \ + pip install --break-system-packages target/wheels/*.whl && \ + cd /sgl-workspace/dynamo && \ pip install --break-system-packages -e . 
&& \ + pip install --break-system-packages --requirement /tmp/local_src/container/deps/requirements.txt ; \ + elif [ "$BRANCH_TYPE" = "remote" ]; then \ + cd dynamo/lib/bindings/python && \ + pip install --break-system-packages maturin && \ + maturin build --release && \ + pip install --break-system-packages target/wheels/*.whl && \ cd /sgl-workspace/dynamo && \ - pip install --break-system-packages . - -RUN pip install --break-system-packages sglang-router==0.1.9 - -# Install dependencies -RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ - pip install --break-system-packages --requirement /tmp/requirements.txt - -RUN wget --tries=3 --waitretry=5 \ + pip install --break-system-packages -e . && \ + pip install --break-system-packages --requirement /sgl-workspace/dynamo/container/deps/requirements.txt ; \ + else \ + pip install --break-system-packages ai-dynamo ; \ + fi \ +&& rm -rf /tmp/local_src + +# Install NATS and ETCD +RUN case "$(uname -m)" in \ + x86_64) ARCH=amd64 ;; \ + aarch64) ARCH=arm64 ;; \ + *) echo "Unsupported architecture: $(uname -m)" && exit 1 ;; \ + esac && \ + wget --tries=3 --waitretry=5 \ https://github.com/nats-io/nats-server/releases/download/v2.10.28/\ nats-server-v2.10.28-${ARCH}.deb && \ dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb ENV ETCD_VERSION="v3.5.21" -RUN wget --tries=3 --waitretry=5 \ +RUN case "$(uname -m)" in \ + x86_64) ARCH=amd64 ;; \ + aarch64) ARCH=arm64 ;; \ + *) echo "Unsupported architecture: $(uname -m)" && exit 1 ;; \ + esac && \ + wget --tries=3 --waitretry=5 \ https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/\ etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ mkdir -p /usr/local/bin/etcd && \ @@ -107,29 +72,6 @@ etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ ENV PATH=/usr/local/bin/etcd:$PATH -# GenAI Perf -RUN apt-get purge -y cmake - -RUN mkdir /sgl-workspace/cmake_build && \ - cd 
/sgl-workspace/cmake_build && \ - wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/\ -cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - tar -xzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake && \ - rm cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz - -ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH -RUN cmake --version - -RUN git clone --depth=1 \ - https://github.com/triton-inference-server/perf_analyzer.git && \ - mkdir perf_analyzer/build && \ - cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build perf_analyzer/build -- -j$(nproc) - -ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH -RUN pip install --break-system-packages aiperf - # Enable forceful shutdown of inflight requests ENV SGL_FORCE_SHUTDOWN=1 diff --git a/docs/backends/sglang/README.md b/docs/backends/sglang/README.md index 553a3405f9bb..4697b0797cd4 100644 --- a/docs/backends/sglang/README.md +++ b/docs/backends/sglang/README.md @@ -5,8 +5,6 @@ SPDX-License-Identifier: Apache-2.0 # Running SGLang with Dynamo -This directory contains an SGLang component for Dynamo and reference implementations for deploying Large Language Models (LLMs) in various configurations using SGLang. SGLang internally uses ZMQ to communicate between the ingress and the engine processes. For Dynamo, we leverage the runtime to communicate directly with the engine processes and handle ingress and pre/post processing on our end. 
- ## Use the Latest Release We recommend using the latest stable release of dynamo to avoid breaking changes: @@ -24,6 +22,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) ## Table of Contents - [Feature Support Matrix](#feature-support-matrix) - [Dynamo SGLang Integration](#dynamo-sglang-integration) +- [Installation](#installation) - [Quick Start](#quick-start) - [Single Node Examples](#run-single-node-examples) - [Multi-Node and Advanced Examples](#advanced-examples) @@ -40,17 +39,8 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | [**KV-Aware Routing**](../../architecture/kv_cache_routing.md) | ✅ | | | [**SLA-Based Planner**](../../architecture/sla_planner.md) | ✅ | | | [**Multimodal EPD Disaggregation**](multimodal_epd.md) | ✅ | | -| [**Load Based Planner**](../../architecture/load_planner.md) | ❌ | Planned | | [**KVBM**](../../architecture/kvbm_architecture.md) | ❌ | Planned | -### Large Scale P/D and WideEP Features - -| Feature | SGLang | Notes | -|---------------------|--------|--------------------------------------------------------------| -| **WideEP** | ✅ | Full support on H100s/GB200 | -| **DP Rank Routing** | 🚧 | Direct routing supported. Dynamo KV router does not router to DP worker | -| **GB200 Support** | ✅ | | - ## Dynamo SGLang Integration @@ -65,7 +55,7 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu | Argument | Description | Default | SGLang Equivalent | |----------|-------------|---------|-------------------| | `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A | -| `--migration-limit` | Max times a request can migrate between workers | `0` (disabled) | N/A | +| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../../docs/architecture/request_migration.md). 
| `0` (disabled) | N/A | | `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` | | `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` | | `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A | @@ -73,39 +63,33 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu #### Tokenizer Behavior -- **Default (`--use-sglang-tokenizer` not set)**: Dynamo handles tokenization and passes `input_ids` to SGLang -- **With `--use-sglang-tokenizer`**: SGLang handles tokenization, Dynamo passes raw prompts - -> **Note**: When using `--use-sglang-tokenizer`, only `v1/chat/completions` endpoints are available through Dynamo's frontend. - -## SGLang Quick Start - -Below we provide a guide that lets you run all of our common deployment patterns on a single node. - -### Start NATS and ETCD in the background - -Start using [Docker Compose](../../../deploy/docker-compose.yml) +- **Default (`--use-sglang-tokenizer` not set)**: Dynamo handles tokenization/detokenization via our blazing fast frontend and passes `input_ids` to SGLang +- **With `--use-sglang-tokenizer`**: SGLang handles tokenization/detokenization, Dynamo passes raw prompts -```bash -docker compose -f deploy/docker-compose.yml up -d -``` +> [!NOTE] +> When using `--use-sglang-tokenizer`, only `v1/chat/completions` is available through Dynamo's frontend. -### Install `ai-dynamo[sglang]` +## Installation -#### Install latest release +### Install latest release We suggest using uv to install the latest release of ai-dynamo[sglang]. You can install it with `curl -LsSf https://astral.sh/uv/install.sh | sh` +
+Expand for instructions + ```bash # create a virtual env uv venv --python 3.12 --seed -# install the latest release +# install the latest release (which comes bundled with a stable sglang version) uv pip install "ai-dynamo[sglang]" ``` -#### Installing editable version for development +
+ +### Install editable version for development
-Instructions +Expand for instructions This requires having rust installed. We also recommend having a proper installation of the cuda toolkit as sglang requires `nvcc` to be available. @@ -119,40 +103,61 @@ maturin develop --uv cd $DYNAMO_HOME # installs sglang supported version along with dynamo # include the prerelease flag to install flashinfer rc versions -uv pip install --prerelease=allow -e .[sglang] +uv pip install -e . +# install any sglang version >= 0.5.3 +uv pip install "sglang[all]==0.5.3.post1" ```
-#### Using prebuilt docker containers +### Using docker containers
-Instructions +Expand for instructions + +We are in the process of shipping pre-built docker containers that contain installations of DeepEP, DeepGEMM, and NVSHMEM in order to support WideEP and P/D. For now, you can quickly build the container from source with the following command. ```bash -docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.0 +cd $DYNAMO_ROOT +docker build \ + -f container/Dockerfile.sglang-wideep \ + -t dynamo-sglang \ + --no-cache \ + . ``` -
- -#### Building docker container from source - -
-Instructions +And then run it using ```bash -./container/build.sh --framework sglang -# run container using prebuild wheel -./container/run.sh --framework sglang -it -# mount workspace for development -./container/run.sh --framework sglang --mount-workspace +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + dynamo-sglang:latest ```
-## Run Single Node Examples +## Quick Start + +Below we provide a guide that lets you run all of our common deployment patterns on a single node. + +### Start NATS and ETCD in the background -> [!IMPORTANT] +Start using [Docker Compose](../../../deploy/docker-compose.yml) + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + +> [!TIP] > Each example corresponds to a simple bash script that runs the OpenAI compatible server, processor, and optional router (written in Rust) and LLM engine (written in Python) in a single terminal. You can easily take each command and run them in separate terminals. > > Additionally - because we use sglang's argument parser, you can pass in any argument that sglang supports to the worker! @@ -167,15 +172,12 @@ cd $DYNAMO_HOME/components/backends/sglang ### Aggregated Serving with KV Routing -> [!NOTE] -> Until sglang releases a version > v0.5.0rc0, you will have to install from source to use kv_routing. You can do this by running `git clone https://github.com/sgl-project/sglang.git && cd sglang && uv pip install -e "python[all]"`. We will update this section once sglang releases a newer version. - ```bash cd $DYNAMO_HOME/components/backends/sglang ./launch/agg_router.sh ``` -### Aggregated Serving with Embeddings +### Aggregated Serving for Embedding Models Here's an example that uses the [Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B) model. @@ -184,7 +186,8 @@ cd $DYNAMO_HOME/components/backends/sglang ./launch/agg_embed.sh ``` -Send the following request to verify your deployment: +
+Send the following request to verify your deployment: ```bash curl localhost:8000/v1/embeddings \ @@ -195,28 +198,23 @@ curl localhost:8000/v1/embeddings \ }' ``` -### Disaggregated serving - -
-Under the hood: SGLang Load Balancer vs Dynamo Discovery +
-SGLang uses a mini load balancer to route requests to handle disaggregated serving. The load balancer functions as follows: +### Disaggregated serving -1. The load balancer receives a request from the client -2. A random `(prefill, decode)` pair is selected from the pool of available workers -3. Request is sent to both `prefill` and `decode` workers via asyncio tasks -4. Internally disaggregation is done from prefill -> decode +See [SGLang Disaggregation](sglang-disaggregation.md) to learn more about how sglang and dynamo handle disaggregated serving. -Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead, we first route to a random prefill worker, select a random decode worker, and then send the request to both. Internally, SGLang's bootstrap server (which is a part of the `tokenizer_manager`) is used in conjuction with NIXL to handle the kv transfer. -
+```bash +cd $DYNAMO_HOME/components/backends/sglang +./launch/disagg.sh +``` -> [!IMPORTANT] -> Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA based model](https://github.com/sgl-project/sglang/pull/5922) +### Disaggregated Serving with KV Aware Prefill Routing ```bash cd $DYNAMO_HOME/components/backends/sglang -./launch/disagg.sh +./launch/disagg_router.sh ``` ### Disaggregated Serving with Mixture-of-Experts (MoE) models and DP attention @@ -229,8 +227,6 @@ cd $DYNAMO_HOME/components/backends/sglang ./launch/disagg_dp_attn.sh ``` -When using MoE models, you can also use the our implementation of the native SGLang endpoints to record expert distribution data. The `disagg_dp_attn.sh` script automatically sets up the SGLang HTTP server, the environment variable that controls the expert distribution recording directory, and sets up the expert distribution recording mode to `stat`. You can learn more about expert parallelism load balancing [here](expert-distribution-eplb.md). - ### Testing the Deployment Send a test request to verify your deployment: @@ -251,16 +247,6 @@ curl localhost:8000/v1/chat/completions \ }' ``` -## Request Migration - -You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: - -```bash -python3 -m dynamo.sglang ... --migration-limit=3 -``` - -This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/architecture/request_migration.md) documentation for details on how this works. - ## Advanced Examples Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! @@ -269,7 +255,7 @@ Below we provide a selected list of advanced examples. 
Please open up an issue i - **[Run a multi-node model](multinode-examples.md)** ### Large scale P/D disaggregation with WideEP -- **[Run DeepSeek-R1 on 104+ H100s](dsr1-wideep-h100.md)** +- **[Run DeepSeek-R1-FP8 on H100s](dsr1-wideep-h100.md)** - **[Run DeepSeek-R1-FP8 on GB200s](dsr1-wideep-gb200.md)** ### Hierarchical Cache (HiCache) diff --git a/docs/backends/sglang/dsr1-wideep-gb200.md b/docs/backends/sglang/dsr1-wideep-gb200.md index 8cb09df066ce..e7d571cc8f95 100644 --- a/docs/backends/sglang/dsr1-wideep-gb200.md +++ b/docs/backends/sglang/dsr1-wideep-gb200.md @@ -17,21 +17,21 @@ limitations under the License. # Running DeepSeek-R1 Disaggregated with WideEP on GB200s -Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end to end optimization is still a work in progress but you can get this up and running with the following steps. In ths example, we will run 1 prefill worker on 2 GB200 nodes (4 GPUs each) and 1 decode worker on 12 GB200 nodes (total 56 GPUs). +Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-wideep` and a sample configuration that demonstrates WideEP and P/D disaggregation. To run the exact configuration shown in the blog post, you can view the commands created by the SGLang team [here](https://github.com/sgl-project/sglang/issues/7227). In this example, we will run 1 prefill worker on 2 GB200 nodes (4 GPUs each) and 1 decode worker on 2 GB200 nodes (total 8 GPUs). ## Instructions -1. Build the Dynamo container +1. Build the Dynamo container using the latest published dynamo version and stable sglang version. 
If you want to build from a local dynamo repo, you can add `--build-arg BRANCH_TYPE=local` to the build command. If you want to build from a remote dynamo repo, you can add `--build-arg BRANCH_TYPE=remote` to the build command. If you want to use a specific tag for the default sglang version, you can add `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command. + +> [!Note] +> Please ensure that you are building this on an ARM64 machine. The correct SGLang image will be selected automatically via the multi-arch manifest. ```bash cd $DYNAMO_ROOT docker build \ -f container/Dockerfile.sglang-wideep \ -t dynamo-wideep-gb200 \ - --build-arg MODE=blackwell \ - --build-arg SGLANG_IMAGE_TAG=v0.5.3rc0-cu129-gb200 \ - --build-arg ARCH=arm64 \ - --build-arg ARCH_ALT=aarch64 \ + --no-cache \ . ``` @@ -159,4 +159,4 @@ python3 -m dynamo.sglang \ --log-level debug ``` -On the other decode nodes (this example has 12 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +On the other decode nodes (this example has 12 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 \ No newline at end of file diff --git a/docs/backends/sglang/dsr1-wideep-h100.md b/docs/backends/sglang/dsr1-wideep-h100.md index 4ca5c04a213d..a0f2f83ba32a 100644 --- a/docs/backends/sglang/dsr1-wideep-h100.md +++ b/docs/backends/sglang/dsr1-wideep-h100.md @@ -5,19 +5,24 @@ SPDX-License-Identifier: Apache-2.0 # Running DeepSeek-R1 Disaggregated with WideEP on H100s -Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-05-05-large-scale-ep/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-wideep` and configurations to deploy this at scale. In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs).
+Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-05-05-large-scale-ep/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-wideep` and a sample configuration that demonstrates WideEP and P/D disaggregation. To run the exact configuration shown in the blog post, you can view the commands created by the SGLang team [here](https://github.com/sgl-project/sglang/issues/6017). In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 4 H100 nodes (64 total GPUs). ## Instructions -1. Build the Dynamo container +1. Build the Dynamo container using the latest published dynamo version and stable sglang version. If you want to build from a local dynamo repo, you can add `--build-arg BRANCH_TYPE=local` to the build command. If you want to build from a remote dynamo repo, you can add `--build-arg BRANCH_TYPE=remote` to the build command. If you want to use a specific tag for the default sglang version, you can add `--build-arg SGLANG_IMAGE_TAG=` to the build command. + +> [!Note] +> Please ensure that you are building this on an AMD64 (x86_64) machine. The correct SGLang image will be selected automatically via the multi-arch manifest. ```bash cd $DYNAMO_ROOT -docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache +docker build \ + -f container/Dockerfile.sglang-wideep \ + -t dynamo-wideep \ + --no-cache \ + . ``` -You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=` to the build command. - 2. You can run this container on each 8xH100 node using the following command. 
> [!IMPORTANT] @@ -46,8 +51,6 @@ In each container, you should be in the `/sgl-workspace/dynamo/components/backen ```bash # run ingress python3 -m dynamo.frontend --http-port=8000 & -# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) -python3 utils/sgl_http_server.py --ns dynamo & # run prefill worker python3 -m dynamo.sglang \ --model-path /model/ \ @@ -62,8 +65,9 @@ python3 -m dynamo.sglang \ --tp-size 32 \ --dp-size 32 \ --enable-dp-attention \ - --decode-log-interval 1 \ - --enable-deepep-moe \ + --decode-log-interval 1000 \ + --moe-a2a-backend deepep \ + --load-balance-method round_robin \ --page-size 1 \ --trust-remote-code \ --moe-dense-tp-size 1 \ @@ -92,13 +96,14 @@ python3 -m dynamo.sglang \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ - --nnodes 9 \ + --nnodes 4 \ --node-rank 0 \ - --tp-size 72 \ - --dp-size 72 \ + --tp-size 32 \ + --dp-size 32 \ --enable-dp-attention \ - --decode-log-interval 1 \ - --enable-deepep-moe \ + --decode-log-interval 1000 \ + --moe-a2a-backend deepep \ + --prefill-round-robin-balance \ --page-size 1 \ --trust-remote-code \ --moe-dense-tp-size 1 \ @@ -112,59 +117,4 @@ python3 -m dynamo.sglang \ --cuda-graph-bs 128 ``` -On the other decode nodes (this example has 9 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, and 8 - -## Benchmarking - -In the official [blog post repro instructions](https://github.com/sgl-project/sglang/issues/6017), SGL uses batch inference to benchmark their prefill and decode workers. They do this by pretokenizing the ShareGPT dataset and then creating a batch of 8192 requests with ISL 4096 and OSL 5 (for prefill stress test) and a batch of 40000 with ISL 2000 and OSL 100 (for decode stress test). 
If you want to repro these benchmarks, you will need to add the following flags to the prefill and decode commands: - -prefill: - -```bash -... ---max-running-requests 8192 \ ---max-total-tokens 131072 \ ---context-length 8192 \ ---init-expert-location /configs/prefill_in4096.json \ ---chunked-prefill-size 524288 - -``` - -decode: - -```bash -... ---max-running-requests 18432 \ ---context-length 4500 \ ---init-expert-location /configs/decode_in2000out100.json -``` - -We currently provide 2 different ways to perform an end to end benchmark which includes using our OpenAI frontend and tokenization. We will continue to add better support for these sorts of large single batch workloads in the future. - -1. **GenAI Perf to benchmark end to end performance with 8k ISL 256 OSL** - We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. - -Example usage: - -```bash -# warmup -./utils/bench.sh HEAD_PREFILL_NODE_IP --type warmup -# if you ran the http server on the head prefill node, you can optionally flush the kv cache for all workers (similar to SGLangs benchmarking script) -curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache -# run benchmark -./utils/bench.sh HEAD_PREFILL_NODE_IP --type e2e -``` - -2. **GenAI Perf to benchmark completions with custom dataset** - We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. 
If you don't want to use ShareGPT - you can also use GenAI Perf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. - -Example usage: - -```bash -# generate data -python3 src/dynamo/sglang/utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1 -# if you ran the http server on the head prefill node, you can optionally flush the kv cache for all workers (similar to SGLangs benchmarking script) -curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache -# run benchmark -./utils/bench.sh HEAD_PREFILL_NODE_IP --type custom_completions -``` +On the other decode nodes (this example has 4 total decode nodes), run the same command but change `--node-rank` to 1, 2, and 3 diff --git a/docs/backends/sglang/sgl-hicache-example.md b/docs/backends/sglang/sgl-hicache-example.md index ace1f7d025d0..5bb6f8d75604 100644 --- a/docs/backends/sglang/sgl-hicache-example.md +++ b/docs/backends/sglang/sgl-hicache-example.md @@ -55,7 +55,7 @@ curl localhost:8000/v1/chat/completions \ Run the perf script: ```bash -bash -x /workspace/benchmarks/llm/perf.sh \ +bash -x $DYNAMO_ROOT/benchmarks/llm/perf.sh \ --model Qwen/Qwen3-0.6B \ --tensor-parallelism 1 \ --data-parallelism 1 \ diff --git a/docs/backends/sglang/sglang-disaggregation.md b/docs/backends/sglang/sglang-disaggregation.md new file mode 100644 index 000000000000..dc053f18f6cf --- /dev/null +++ b/docs/backends/sglang/sglang-disaggregation.md @@ -0,0 +1,89 @@ + + +# SGLang Disaggregated Serving + +This document explains how SGLang's disaggregated prefill-decode architecture works, both standalone and within Dynamo. + +## Overview + +Disaggregated serving separates the prefill and decode phases of LLM inference into different workers. 
This architecture allows for: +- Independent scaling of prefill and decode resources +- Better resource utilization (prefill is compute-bound, decode is memory-bound) +- Efficient KV cache transfer between workers using RDMA + +## How Dynamo Integrates with SGLang Disaggregation + +**SGLang's standalone approach:** +1. The load balancer receives a request from the client +2. A random `(prefill, decode)` pair is selected from the pool of available workers +3. Request is sent to both `prefill` and `decode` workers via asyncio tasks +4. Internally disaggregation is done from prefill → decode + +**Dynamo's approach:** + +Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead: +1. Route to a decode worker first +2. Choose a prefill worker via round-robin or KV-aware selection +3. Send the request to both workers +4. SGLang's bootstrap server (part of the `tokenizer_manager`) is used in conjunction with NIXL/Mooncake to handle the KV transfer + +## Disaggregation Flow + +The following diagram shows the complete request flow for disaggregated serving: + +```mermaid +sequenceDiagram + participant Client + participant Decode + participant Prefill + + Note over Decode,Prefill: 0. Setup Phase (One-Time) + Decode->>Prefill: Register RDMA connection info (base GPU memory pointers) + Note over Client,Prefill: Per-Request Phase + Client->>Decode: 1. Send request + Decode->>Prefill: 2. Forward request + get bootstrap_room + Prefill-->>Decode: Return bootstrap_room ID + Note over Decode: 3. Allocate GPU memory for KV cache + Decode->>Prefill: Send allocation info (page indices, metadata buffer) + Note over Prefill: 4. Prefill forward pass + par Decode polls + loop Poll transfer + Note over Decode: 5. Poll for KV arrival + end + and Prefill transfers + Note over Prefill: 6. RDMA write KV to decode + Prefill->>Decode: Transfer KV cache + metadata + end + Note over Prefill: 7. 
Poll RDMA handles + Note over Prefill: Transfer complete, deallocate metadata + Note over Decode: 8. KV received, start decode + loop Generate tokens + Note over Decode: Decode forward pass + Decode-->>Client: Stream output token + end +``` + +### Key Steps Explained + +**Setup Phase (One-Time)** +- Decode workers register their RDMA connection information with prefill workers +- This includes base GPU memory pointers for direct memory access + +**Per-Request Flow** +1. **Request initiation**: Client sends request to decode worker +2. **Bootstrap room allocation**: Decode forwards to prefill and receives a bootstrap_room ID for coordination +3. **Memory allocation**: Decode allocates GPU memory pages for incoming KV cache +4. **Prefill execution**: Prefill worker processes the prompt and generates KV cache +5. **KV transfer**: Prefill uses RDMA to write KV cache directly to decode's GPU memory (while decode polls for completion) +6. **Cleanup**: Prefill deallocates transfer metadata after confirming completion +7. **Decode phase**: Decode worker generates tokens using the transferred KV cache +8. 
**Streaming**: Tokens are streamed back to the client as they're generated + +### Performance Characteristics + +- **RDMA transfer**: Zero-copy GPU-to-GPU transfer with minimal CPU involvement +- **Parallel operations**: Decode can poll while prefill transfers data +- **One-time setup**: RDMA connections established once, reused for all requests \ No newline at end of file diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst index faecb86624a3..d0dedad0b4f4 100644 --- a/docs/hidden_toctree.rst +++ b/docs/hidden_toctree.rst @@ -59,6 +59,7 @@ backends/sglang/gpt-oss.md backends/sglang/multimodal_epd.md backends/sglang/sgl-hicache-example.md + backends/sglang/sglang-disaggregation.md examples/README.md examples/runtime/hello_world/README.md diff --git a/pyproject.toml b/pyproject.toml index 54e08fe7ae45..61e2c32a043c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ vllm = [ sglang = [ "uvloop", "nixl<=0.6.0", - "sglang[all]==0.5.3", + "sglang[all]==0.5.3.post1", ] [dependency-groups] From ec47178641f78f3281a8b72a133b91ec8fb99e8e Mon Sep 17 00:00:00 2001 From: Alec <35311602+alec-flowers@users.noreply.github.com> Date: Wed, 15 Oct 2025 15:37:38 -0700 Subject: [PATCH 07/26] fix: cherrypick cuda 129 (#3652) Signed-off-by: alec-flowers --- .github/actions/docker-build/action.yml | 29 ++++++++++++++++++- .../container-validation-backends.yml | 6 +++- container/Dockerfile.vllm | 4 +++ container/deps/vllm/install_vllm.sh | 6 ++-- 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 87e7b243271e..e9d3d68351ac 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -34,6 +34,18 @@ inputs: aws_secret_access_key: description: 'AWS Secret Access Key' required: false + base_image_tag: + description: 'Optional override for base image tag passed to build.sh' + required: false + runtime_image_tag: + description: 
'Optional override for RUNTIME_IMAGE_TAG build-arg' + required: false + cuda_version: + description: 'Optional override for CUDA_VERSION build-arg' + required: false + torch_backend: + description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)' + required: false outputs: image_tag: @@ -81,6 +93,21 @@ runs: echo "BUILD_START_TIME=${BUILD_START_TIME}" >> $GITHUB_ENV echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT + # Collect optional overrides provided by the workflow + EXTRA_ARGS="" + if [ -n "${{ inputs.base_image_tag }}" ]; then + EXTRA_ARGS+=" --base-image-tag ${{ inputs.base_image_tag }}" + fi + if [ -n "${{ inputs.runtime_image_tag }}" ]; then + EXTRA_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=${{ inputs.runtime_image_tag }}" + fi + if [ -n "${{ inputs.cuda_version }}" ]; then + EXTRA_ARGS+=" --build-arg CUDA_VERSION=${{ inputs.cuda_version }}" + fi + if [ -n "${{ inputs.torch_backend }}" ]; then + EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}" + fi + ./container/build.sh --tag "$IMAGE_TAG" \ --target ${{ inputs.target }} \ --vllm-max-jobs 10 \ @@ -88,7 +115,7 @@ runs: --platform ${{ inputs.platform }} \ --use-sccache \ --sccache-bucket "$SCCACHE_S3_BUCKET" \ - --sccache-region "$AWS_DEFAULT_REGION" + --sccache-region "$AWS_DEFAULT_REGION" $EXTRA_ARGS BUILD_END_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ) echo "🕐 Build ended at: ${BUILD_END_TIME}" diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index ec0b41d9c764..1deef8e5ac1e 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -58,6 +58,10 @@ jobs: framework: vllm target: runtime platform: 'linux/${{ matrix.platform.arch }}' + base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }} + runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }} + cuda_version: ${{ 
matrix.platform.arch == 'arm64' && '129' || '' }} + torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }} ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} ci_token: ${{ secrets.CI_TOKEN }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} @@ -251,4 +255,4 @@ jobs: CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }} run: | # Upload complete workflow metrics including container metrics - python3 .github/workflows/upload_complete_workflow_metrics.py \ No newline at end of file + python3 .github/workflows/upload_complete_workflow_metrics.py diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index bb0416ac07db..d2337637953e 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -88,6 +88,10 @@ RUN apt-get update -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# if libmlx5.so not shipped with 24.04 rdma-core packaging, CMAKE will fail when looking for +# generic dev name .so so we symlink .s0.1 -> .so +RUN ln -sf /usr/lib/aarch64-linux-gnu/libmlx5.so.1 /usr/lib/aarch64-linux-gnu/libmlx5.so || true + ### VIRTUAL ENVIRONMENT SETUP ### COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index 195981194991..0ebbb58823fb 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -136,9 +136,9 @@ git checkout $VLLM_REF echo "\n=== Installing vLLM & FlashInfer ===" -if [[ $VLLM_REF =~ ^v ]] && [ "$ARCH" = "amd64" ]; then - # VLLM_REF starts with 'v' and amd64 - use pip install with version tag - echo "Installing vLLM $VLLM_REF from PyPI..." +if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] && [ "$TORCH_BACKEND" = "cu129" ]; }; }; then + # VLLM_REF starts with 'v' and either amd64, or arm64 with cu129 backend - use PyPI install + echo "Installing vLLM $VLLM_REF from PyPI... 
(ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)" uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND From 1ef8cc1d3b9436c2deaf313d282b77d09d2923a9 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Thu, 16 Oct 2025 14:11:56 -0700 Subject: [PATCH 08/26] fix: update model recipe for llama-3 70b to match with common recipe template #3637 (#3656) --- .../model-cache/model-download.yaml | 18 +- recipes/llama-3-70b/vllm/agg/deploy.yaml | 11 +- recipes/llama-3-70b/vllm/agg/perf.yaml | 159 +++++++++++++----- .../vllm/disagg-multi-node/deploy.yaml | 14 +- .../vllm/disagg-multi-node/perf.yaml | 159 +++++++++++++----- .../vllm/disagg-single-node/deploy.yaml | 14 +- .../vllm/disagg-single-node/perf.yaml | 159 +++++++++++++----- 7 files changed, 385 insertions(+), 149 deletions(-) diff --git a/recipes/llama-3-70b/model-cache/model-download.yaml b/recipes/llama-3-70b/model-cache/model-download.yaml index d8e1dfaa8cb8..1a71923b1559 100644 --- a/recipes/llama-3-70b/model-cache/model-download.yaml +++ b/recipes/llama-3-70b/model-cache/model-download.yaml @@ -22,24 +22,22 @@ spec: - secretRef: name: hf-token-secret env: - # NOTE: This is the model name for the llama-3-70b model - # Update this to model name for the model you are downloading - name: MODEL_NAME value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: HF_TOKEN + - name: HF_HOME + value: /model-store + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: MODEL_REVISION + value: ddb4128556dfcff99e0c41aee159ea6c3e655dcd args: - | set -eux pip install --no-cache-dir huggingface_hub hf_transfer - export HF_HUB_ENABLE_HF_TRANSFER=1 - huggingface-cli download $MODEL_NAME + hf download $MODEL_NAME --revision $MODEL_REVISION volumeMounts: - name: model-cache - mountPath: /root/.cache/huggingface/hub + mountPath: /model-store volumes: - name: model-cache persistentVolumeClaim: diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml 
b/recipes/llama-3-70b/vllm/agg/deploy.yaml index 274b4633901e..09e56ec6a891 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -32,8 +32,13 @@ spec: size: 20Gi extraPodSpec: mainContainer: + env: + - name: SERVED_MODEL_NAME + value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" + - name: MODEL_PATH + value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" args: - - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -42,6 +47,6 @@ spec: replicas: 1 resources: limits: - gpu: "8" + gpu: "4" requests: - gpu: "8" \ No newline at end of file + gpu: "4" \ No newline at end of file diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index b750eb709c64..8c5a470f119c 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -5,7 +5,7 @@ kind: Job metadata: name: llama3-70b-agg-perf spec: - backoffLimit: 3 + backoffLimit: 1 completions: 1 parallelism: 1 template: @@ -16,57 +16,128 @@ spec: restartPolicy: Never containers: - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 - workingDir: /workspace/components/backends/vllm command: - /bin/sh - -c - | - # wait for the model to be ready - export ENDPOINT=llama3-70b-agg-0:8000 - export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - export INTERVAL=5 - echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..." - while ! 
curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - --max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . - echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" 
+ EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + --warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + 
wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" </dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - --max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . 
- echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" + EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + --warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 
'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" </dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai-$RANDOM" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - 
--max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . - echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" + EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + 
--warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" < Date: Thu, 16 Oct 2025 17:45:49 -0400 Subject: [PATCH 09/26] fix: copy commit info in trtllm build (#3619) (#3670) Signed-off-by: Anant Sharma --- container/Dockerfile.trtllm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 273d746d82ce..6776a67ce127 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -191,8 +191,10 @@ ARG HAS_TRTLLM_CONTEXT ARG TENSORRTLLM_PIP_WHEEL ARG TENSORRTLLM_INDEX_URL -# Copy only wheel files from trtllm_wheel stage from build_context +# Copy only wheel files and commit info from trtllm_wheel stage from build_context COPY --from=trtllm_wheel /*.whl /trtllm_wheel/ +COPY --from=trtllm_wheel /*.txt /trtllm_wheel/ + # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # because there might be mismatched versions of TensorRT between the NGC PyTorch # and the TRTLLM wheel. 
From 048ebd8ffd1c0d649cd3eb5667372c15d685facf Mon Sep 17 00:00:00 2001 From: Anthony Casagrande Date: Thu, 16 Oct 2025 15:46:36 -0700 Subject: [PATCH 10/26] fix: update invalid AIPerf scripts and parsing logic (#3681) Signed-off-by: Anthony Casagrande --- benchmarks/llm/perf.sh | 3 +- benchmarks/profiler/profile_sla.py | 8 +- benchmarks/profiler/utils/profile_decode.py | 6 +- benchmarks/profiler/utils/profile_prefill.py | 2 +- benchmarks/router/README.md | 8 +- benchmarks/sin_load_generator/README.md | 2 +- docs/backends/trtllm/gpt-oss.md | 16 ++-- recipes/gpt-oss-120b/trtllm/agg/perf.yaml | 13 +-- recipes/llama-3-70b/vllm/agg/perf.yaml | 3 +- .../vllm/disagg-multi-node/perf.yaml | 3 +- .../vllm/disagg-single-node/perf.yaml | 3 +- tests/fault_tolerance/deploy/client.py | 20 ++-- tests/fault_tolerance/deploy/parse_results.py | 67 +++++-------- tests/planner/README.md | 8 +- tests/planner/utils/load_generator.py | 94 +++++++------------ 15 files changed, 99 insertions(+), 157 deletions(-) diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh index 446ec1f74d0c..a1f66dc01710 100644 --- a/benchmarks/llm/perf.sh +++ b/benchmarks/llm/perf.sh @@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do --num-dataset-entries $(($concurrency*12)) \ --random-seed 100 \ --artifact-dir ${artifact_dir} \ - -- \ + --ui simple \ -v \ - --max-threads ${concurrency} \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 8c670986cffa..757c073b2851 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -256,7 +256,7 @@ async def run_profile(args): base_url=base_url, ) if aiperf_result is not None: - ttft = aiperf_result["records"]["ttft"]["avg"] + ttft = aiperf_result["time_to_first_token"]["avg"] logger.info("Cleaning up deployment...") await client.delete_deployment() @@ -435,11 +435,9 @@ async def run_profile(args): 
base_url=base_url, ) if aiperf_result is not None: - itl = aiperf_result["records"]["inter_token_latency"]["avg"] + itl = aiperf_result["inter_token_latency"]["avg"] thpt_per_gpu = ( - aiperf_result["records"]["output_token_throughput"][ - "avg" - ] + aiperf_result["output_token_throughput"]["avg"] / num_gpus ) diff --git a/benchmarks/profiler/utils/profile_decode.py b/benchmarks/profiler/utils/profile_decode.py index 1a9cbf3d96fa..f0a819ec5dec 100644 --- a/benchmarks/profiler/utils/profile_decode.py +++ b/benchmarks/profiler/utils/profile_decode.py @@ -124,10 +124,8 @@ def get_itl_and_thpt_per_gpu(isl, osl, num_request): base_url=url, ) if aiperf_result is not None: - itl = aiperf_result["records"]["inter_token_latency"]["avg"] - thpt_per_gpu = ( - aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus - ) + itl = aiperf_result["inter_token_latency"]["avg"] + thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus return itl, thpt_per_gpu return None, None diff --git a/benchmarks/profiler/utils/profile_prefill.py b/benchmarks/profiler/utils/profile_prefill.py index d7f5dae91bf0..48171bdd7e63 100644 --- a/benchmarks/profiler/utils/profile_prefill.py +++ b/benchmarks/profiler/utils/profile_prefill.py @@ -90,7 +90,7 @@ def get_ttft(isl): base_url=url, ) if aiperf_result is not None: - return aiperf_result["records"]["ttft"]["avg"] + return aiperf_result["time_to_first_token"]["avg"] return None return _profile_prefill_helper( diff --git a/benchmarks/router/README.md b/benchmarks/router/README.md index 8ea830b759ac..40d8f127fd6b 100644 --- a/benchmarks/router/README.md +++ b/benchmarks/router/README.md @@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c - etcd and NATS running (required for Dynamo coordination) - Required Python packages: - `dynamo` package (with vllm and frontend modules) - - `genai-perf` for benchmarking + - `aiperf` for benchmarking - `matplotlib` for plotting results - 
`data-generator` package (install with `pip install -e ./benchmarks` from repo root) @@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli ``` > [!Note] -> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to loadgen on the trace files: +> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files: > ```bash -> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf +> pip install git+https://github.com/ai-dynamo/aiperf.git > ``` -> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is. +> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is. ## Troubleshooting diff --git a/benchmarks/sin_load_generator/README.md b/benchmarks/sin_load_generator/README.md index 7c3ec5cf303a..82b7dee5b9c5 100644 --- a/benchmarks/sin_load_generator/README.md +++ b/benchmarks/sin_load_generator/README.md @@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0 # Sinusoidal Load Generator -`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf). +`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf). 
## Usage diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 6a11712724ae..071b88bb2e2d 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -402,9 +402,9 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ``` ## Benchmarking -### Performance Testing with GenAI-Perf +### Performance Testing with AIPerf -The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. +The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. **Run the following benchmark from inside the container** (after completing the deployment steps above): @@ -413,7 +413,7 @@ The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/ mkdir -p /tmp/benchmark-results # Run the benchmark - this command tests the deployment with high-concurrency synthetic workload -genai-perf profile \ +aiperf profile \ --model openai/gpt-oss-120b \ --tokenizer /model \ --endpoint-type chat \ @@ -434,9 +434,7 @@ genai-perf profile \ --num-dataset-entries 8000 \ --random-seed 100 \ --artifact-dir /tmp/benchmark-results \ - -- \ -v \ - --max-threads 500 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' ``` @@ -457,13 +455,13 @@ Key parameters you can adjust: - `--output-tokens-mean`: Average output length (tests decode throughput) - `--request-count`: Total number of requests for the benchmark -### Installing GenAI-Perf Outside the Container +### Installing AIPerf Outside the Container If you prefer to run benchmarks from outside the 
container: ```bash -# Install GenAI-Perf -pip install genai-perf +# Install AIPerf +pip install aiperf # Then run the same benchmark command, adjusting the tokenizer path if needed ``` @@ -520,4 +518,4 @@ flowchart TD - **Production Deployment**: For multi-node deployments, see the [Multi-node Guide](../../../examples/basics/multinode/README.md) - **Advanced Configuration**: Explore TensorRT-LLM engine building options for further optimization - **Monitoring**: Set up Prometheus and Grafana for production monitoring -- **Performance Benchmarking**: Use GenAI-Perf to measure and optimize your deployment performance +- **Performance Benchmarking**: Use AIPerf to measure and optimize your deployment performance diff --git a/recipes/gpt-oss-120b/trtllm/agg/perf.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml index eed5d69addbf..a1dbbd696aba 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/perf.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml @@ -57,19 +57,20 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-stddev 0 \ --output-tokens-mean $osl \ --output-tokens-stddev 0 \ - --extra-inputs "{\"max_tokens\":$osl}" \ - --extra-inputs "{\"min_tokens\":$osl}" \ - --extra-inputs "{\"ignore_eos\":true}" \ + --extra-inputs "max_tokens:$osl" \ + --extra-inputs "min_tokens:$osl" \ + --extra-inputs "ignore_eos:true" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --extra-inputs "{\"repetition_penalty\":1.0}" \ - --extra-inputs "{\"temperature\": 0.0}" \ + --extra-inputs "repetition_penalty:1.0" \ + --extra-inputs "temperature: 0.0" \ --concurrency $concurrency \ --request-count $((10*concurrency)) \ 
--warmup-request-count $concurrency \ diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index 8c5a470f119c..5773214bf438 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -50,7 +50,8 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml index e2326e45873b..8b24296f828b 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml @@ -50,7 +50,8 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml index 61e53aa5a79e..c2ac8445c589 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml @@ -50,7 +50,8 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + 
--endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/tests/fault_tolerance/deploy/client.py b/tests/fault_tolerance/deploy/client.py index e3053008c43b..e8a6ec24c231 100644 --- a/tests/fault_tolerance/deploy/client.py +++ b/tests/fault_tolerance/deploy/client.py @@ -383,27 +383,19 @@ def log_summary_metrics( with open(profile_json) as f: metrics = json.load(f) - # Extract key metrics from AI-Perf format - records = metrics.get("records", {}) - - # Request count from request_count record - request_count_record = records.get("request_count", {}) - request_count = ( - int(request_count_record.get("avg", 0)) if request_count_record else 0 - ) + # Request count + request_count = int(metrics.get("request_count", {}).get("avg", 0)) # Check for errors - error_summary = metrics.get("error_summary", []) - error_count = len(error_summary) + error_count = len(metrics.get("error_summary", [])) # Latency metrics (in milliseconds) - request_latency = records.get("request_latency", {}) + request_latency = metrics.get("request_latency", {}) avg_latency = request_latency.get("avg", 0) / 1000.0 # Convert to seconds p99_latency = request_latency.get("p99", 0) / 1000.0 # Convert to seconds # Throughput metrics - request_throughput = records.get("request_throughput", {}) - throughput = request_throughput.get("avg", 0) + throughput = metrics.get("request_throughput", {}).get("avg", 0) # Log summary logger.info( @@ -417,7 +409,7 @@ def log_summary_metrics( # Log success rate if request_count > 0: - success_rate = (request_count - error_count) / request_count * 100 + success_rate = ((request_count - error_count) / request_count) * 100 logger.info(f"Success rate: {success_rate:.1f}%") # Also write summary to CSV file for aggregation diff --git a/tests/fault_tolerance/deploy/parse_results.py b/tests/fault_tolerance/deploy/parse_results.py index e41275cd44c7..77f28894d31f 100644 --- 
a/tests/fault_tolerance/deploy/parse_results.py +++ b/tests/fault_tolerance/deploy/parse_results.py @@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]: with open(profile_json) as f: client_metrics = json.load(f) - # AI-Perf format has "records" dictionary at the top level - records = client_metrics.get("records", {}) - - # Extract request count (this is the total requests made) - request_count_record = records.get("request_count", {}) - request_count = ( - int(request_count_record.get("avg", 0)) - if request_count_record - else 0 + # Extract request count (this is the total successful requests made) + request_count = int( + client_metrics.get("request_count", {}).get("avg", 0) ) # Check for errors in error_summary - error_summary = client_metrics.get("error_summary", []) - error_count = len(error_summary) + error_count = len(client_metrics.get("error_summary", [])) # Check if test was cancelled - was_cancelled = client_metrics.get("was_cancelled", False) - if was_cancelled: + if client_metrics.get("was_cancelled", False): error_count = request_count # Mark all as failed if cancelled all_metrics["total_requests"] += request_count all_metrics["successful_requests"] += request_count - error_count all_metrics["failed_requests"] += error_count - # Extract latency from request_latency record - request_latency = records.get("request_latency", {}) - + # Extract latency metrics + request_latency = client_metrics.get("request_latency", None) if request_latency: - # Convert milliseconds to seconds for consistency - if "avg" in request_latency: - all_metrics["latencies"].append(request_latency["avg"] / 1000.0) - if "p50" in request_latency: - all_metrics["p50_latencies"].append( - request_latency["p50"] / 1000.0 - ) - if "p90" in request_latency: - all_metrics["p90_latencies"].append( - request_latency["p90"] / 1000.0 - ) - if "p99" in request_latency: - all_metrics["p99_latencies"].append( - request_latency["p99"] / 1000.0 - ) - - # Time to 
first token (if available in records) - ttft = records.get("time_to_first_token", {}) or records.get("ttft", {}) - if ttft and "avg" in ttft: - all_metrics["ttft"].append(ttft["avg"] / 1000.0) # Convert ms to s - - # Inter-token latency (if available in records) - itl = records.get("inter_token_latency", {}) or records.get("itl", {}) - if itl and "avg" in itl: - all_metrics["itl"].append(itl["avg"] / 1000.0) # Convert ms to s + all_metrics["latencies"].append(request_latency["avg"] / 1000.0) + all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0) + all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0) + all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0) + + # Time to first token + ttft = client_metrics.get("time_to_first_token", {}).get("avg", None) + if ttft: + all_metrics["ttft"].append(ttft / 1000.0) # Convert ms to s + + # Inter-token latency + itl = client_metrics.get("inter_token_latency", {}).get("avg", None) + if itl: + all_metrics["itl"].append(itl / 1000.0) # Convert ms to s # Throughput from request_throughput record - request_throughput = records.get("request_throughput", {}) - req_throughput = request_throughput.get("avg", 0) + req_throughput = client_metrics.get("request_throughput", {}).get( + "avg", 0 + ) if req_throughput: all_metrics["throughputs"].append(req_throughput) diff --git a/tests/planner/README.md b/tests/planner/README.md index 4c1566cc1beb..e9fdcbe44372 100644 --- a/tests/planner/README.md +++ b/tests/planner/README.md @@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n ``` -Then, port-forward or shell into the frontend pod and run GenAI-Perf to get the goodput: +Then, port-forward or shell into the frontend pod and run AIPerf to get the goodput: ```bash -genai-perf profile \ +aiperf profile \ --model nvidia/Llama-3.1-8B-Instruct-FP8 \ --tokenizer 
nvidia/Llama-3.1-8B-Instruct-FP8 \ --endpoint-type chat \ @@ -227,11 +227,11 @@ genai-perf profile \ --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \ --fixed-schedule True \ --goodput time_to_first_token:200 inter_token_latency:10 \ - -- -v -max-threads 64 \ + -v ``` > [!NOTE] -> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause GenAI-Perf to stuck. We are aware of this issue and are working on fixing it. +> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause AIPerf to stuck. We are aware of this issue and are working on fixing it. #### E2E Perf Test Results diff --git a/tests/planner/utils/load_generator.py b/tests/planner/utils/load_generator.py index 69943de09411..6ac5e38a76d0 100644 --- a/tests/planner/utils/load_generator.py +++ b/tests/planner/utils/load_generator.py @@ -4,7 +4,7 @@ """ Load generation script for SLA planner scaling tests. -This script uses genai-perf to generate load at specific request rates +This script uses aiperf to generate load at specific request rates to test the planner's scaling behavior. """ @@ -24,7 +24,7 @@ class LoadGenerator: - """Generate load using genai-perf to test planner scaling.""" + """Generate load using aiperf to test planner scaling.""" def __init__( self, @@ -40,12 +40,12 @@ def __init__( self.osl = osl self.save_results = save_results - def _calculate_genai_perf_params( + def _calculate_aiperf_params( self, req_per_sec: float, ) -> Dict[str, Any]: """ - Calculate genai-perf parameters to approximate desired request rate. + Calculate aiperf parameters to approximate desired request rate. 
Args: req_per_sec: Desired requests per second @@ -71,15 +71,15 @@ async def generate_load( Args: req_per_sec: Target requests per second duration_sec: Duration to generate load (seconds) - artifact_dir: Directory to store genai-perf artifacts + artifact_dir: Directory to store aiperf artifacts Returns: Dictionary with load test results """ logger.info(f"Generating load: {req_per_sec} req/s for {duration_sec}s") - # Calculate genai-perf parameters - params = self._calculate_genai_perf_params(req_per_sec) + # Calculate aiperf parameters + params = self._calculate_aiperf_params(req_per_sec) logger.info(f"Using request_rate={params['request_rate']} req/s") # Create artifact directory if not provided @@ -95,9 +95,9 @@ async def generate_load( f"Adjusted parameters: duration={duration_sec}s, request_count={request_count}" ) - # Build genai-perf command based on coworker's successful approach + # Build aiperf command based on coworker's successful approach cmd = [ - "genai-perf", + "aiperf", "profile", "--model", self.model, @@ -116,18 +116,13 @@ async def generate_load( str(params["request_rate"]), "--request-count", str(request_count), # Use request count to limit test duration - "--stability-percentage", - "50", "--num-dataset-entries", str( max(20, int(params["request_rate"] * 10)) ), # Generate reasonable dataset size "--artifact-dir", artifact_dir, - "--", "-v", - "-max-threads", - "64", ] logger.info(f"Running command: {' '.join(cmd)}") @@ -135,7 +130,7 @@ async def generate_load( f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s" ) - # Run genai-perf (async) + # Run aiperf (async) start_time = time.time() # More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5)) @@ -152,7 +147,7 @@ async def generate_load( except asyncio.TimeoutError: proc.kill() await proc.communicate() - logger.error("genai-perf timed out") + 
logger.error("aiperf timed out") raise RuntimeError("Load generation timed out") end_time = time.time() @@ -160,13 +155,9 @@ async def generate_load( # Persist logs for debugging try: - with open( - os.path.join(artifact_dir, "genai_perf.stdout.log"), "wb" - ) as f: + with open(os.path.join(artifact_dir, "aiperf.stdout.log"), "wb") as f: f.write(stdout or b"") - with open( - os.path.join(artifact_dir, "genai_perf.stderr.log"), "wb" - ) as f: + with open(os.path.join(artifact_dir, "aiperf.stderr.log"), "wb") as f: f.write(stderr or b"") except Exception: pass @@ -174,31 +165,31 @@ async def generate_load( if proc.returncode == 0: logger.info("Load generation completed successfully") logger.info(f"Actual duration: {actual_duration:.2f}s") - results = self._parse_genai_perf_results(artifact_dir) + results = self._parse_aiperf_results(artifact_dir) results.update( { "requested_req_per_sec": req_per_sec, "actual_duration": actual_duration, "target_duration": duration_sec, - "genai_perf_params": params, + "aiperf_params": params, "artifact_dir": artifact_dir, "success": True, } ) return results else: - logger.error(f"genai-perf failed with return code {proc.returncode}") - raise RuntimeError("genai-perf failed; see logs in artifact dir") + logger.error(f"aiperf failed with return code {proc.returncode}") + raise RuntimeError("aiperf failed; see logs in artifact dir") except RuntimeError: raise except Exception as e: - logger.error(f"genai-perf execution error: {e}") + logger.error(f"aiperf execution error: {e}") raise - def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: - """Parse genai-perf results from artifact directory.""" + def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]: + """Parse aiperf results from artifact directory.""" try: - # Look for the profile_export_genai_perf.json file + # Look for the profile_export_aiperf.json file json_files = [f for f in os.listdir(artifact_dir) if f.endswith(".json")] if not json_files: 
logger.warning("No JSON results found in artifact directory") @@ -207,7 +198,7 @@ def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: # Main results file results_file = None for json_file in json_files: - if "profile_export" in json_file or "genai_perf" in json_file: + if "profile_export" in json_file or "aiperf" in json_file: results_file = os.path.join(artifact_dir, json_file) break @@ -217,40 +208,21 @@ def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: logger.info(f"Parsing results from: {results_file}") with open(results_file, "r") as f: - data = json.load(f) - - results = {} - if "experiments" in data and data["experiments"]: - exp = data["experiments"][0] - if "perf_metrics" in exp: - metrics = exp["perf_metrics"] - results.update( - { - "throughput": metrics.get("throughput", {}).get("avg", 0), - "ttft_mean": metrics.get("ttft", {}).get("avg", 0), - "itl_mean": metrics.get("inter_token_latency", {}).get( - "avg", 0 - ), - "end_to_end_latency_mean": metrics.get( - "request_latency", {} - ).get("avg", 0), - } - ) - if not results and "profile_export_genai_perf" in data: - summary = data.get("summary", {}) - results.update( - { - "throughput": summary.get("throughput", 0), - "ttft_mean": summary.get("time_to_first_token_ms", 0), - "itl_mean": summary.get("inter_token_latency_ms", 0), - } - ) - + metrics = json.load(f) + + results = { + "throughput": metrics.get("output_token_throughput", {}).get("avg", 0), + "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0), + "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0), + "end_to_end_latency_mean": metrics.get("request_latency", {}).get( + "avg", 0 + ), + } logger.info(f"Parsed results: {results}") return results except Exception as e: - logger.warning(f"Failed to parse genai-perf results: {e}") + logger.warning(f"Failed to parse aiperf results: {e}") return {} async def run_scaling_test(self) -> Dict[str, Any]: From 
cbe523fe414b0708e6f23e2c205488b36d65a744 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Thu, 16 Oct 2025 18:55:30 -0400 Subject: [PATCH 11/26] fix: aiconfigurator breaking tests due to not being installed correctly (#3686) Signed-off-by: Harrison Saturley-Hall --- Earthfile | 2 +- benchmarks/profiler/utils/estimate_perf.py | 4 +++- benchmarks/pyproject.toml | 2 +- container/Dockerfile | 8 ++++++-- container/Dockerfile.sglang | 9 +++++++-- container/Dockerfile.trtllm | 8 ++++++-- container/Dockerfile.vllm | 8 ++++++-- tests/planner/README.md | 2 +- tests/profiler/test_profile_sla_aiconfigurator.py | 2 +- 9 files changed, 32 insertions(+), 13 deletions(-) diff --git a/Earthfile b/Earthfile index 9e7e35d9d5b7..57a307cbe69e 100644 --- a/Earthfile +++ b/Earthfile @@ -159,7 +159,7 @@ dynamo-base-docker: ENV VIRTUAL_ENV=/opt/dynamo/venv ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" - RUN uv pip install -r /tmp/requirements.txt + RUN UV_GIT_LFS=1 uv pip install -r /tmp/requirements.txt # Copy and install wheels -- ai-dynamo-runtime first, then ai-dynamo COPY +dynamo-build/ai_dynamo_runtime*.whl /tmp/wheels/ diff --git a/benchmarks/profiler/utils/estimate_perf.py b/benchmarks/profiler/utils/estimate_perf.py index b4f4fc97e17b..a6abc0d0968d 100644 --- a/benchmarks/profiler/utils/estimate_perf.py +++ b/benchmarks/profiler/utils/estimate_perf.py @@ -66,7 +66,9 @@ def _get_model(self, **model_config_kwargs): # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided. 
model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs) - model = aiconfigurator.sdk.models.get_model(self.model_name, model_config) + model = aiconfigurator.sdk.models.get_model( + self.model_name, model_config, self.backend + ) return model def estimate_perf( diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 0f3da13238ef..4eb0dedf0046 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] dependencies = [ - "aiconfigurator==0.2.0", + "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@e46d9089ffe4f5dd62c46914489c55b6dfdbc903", "networkx", "pandas", "pydantic>=2", diff --git a/container/Dockerfile b/container/Dockerfile index b52c5c3bd712..bf17e8ee9abb 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -116,6 +116,7 @@ RUN apt-get update -y \ automake \ cmake \ git \ + git-lfs \ libtool \ meson \ net-tools \ @@ -261,7 +262,10 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \ # Install common and test dependencies RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ - uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt + UV_GIT_LFS=1 uv pip install \ + --no-cache \ + --requirement /tmp/requirements.txt \ + --requirement /tmp/requirements.test.txt ################################## ##### Wheel Build Image ########## @@ -382,7 +386,7 @@ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \ && cd /opt/dynamo/benchmarks \ - && uv pip install . \ + && UV_GIT_LFS=1 uv pip install --no-cache . 
\ && cd - \ && rm -rf /opt/dynamo/benchmarks diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index e892ffc50b9b..92bb8c03ce8f 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -65,6 +65,7 @@ RUN apt-get update -y \ python${PYTHON_VERSION}-dev \ build-essential \ git \ + git-lfs \ # SGLang build dependencies cmake \ ibverbs-providers \ @@ -149,6 +150,7 @@ RUN apt-get update && \ # jq and curl for polling various endpoints and health checks jq \ git \ + git-lfs \ curl \ # Libraries required by UCX to find RDMA devices libibverbs1 rdma-core ibverbs-utils libibumad3 \ @@ -206,14 +208,17 @@ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \ && cd /opt/dynamo/benchmarks \ - && uv pip install . \ + && UV_GIT_LFS=1 uv pip install --no-cache . \ && cd - \ && rm -rf /opt/dynamo/benchmarks # Install common and test dependencies RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ - uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt + UV_GIT_LFS=1 uv pip install \ + --no-cache \ + --requirement /tmp/requirements.txt \ + --requirement /tmp/requirements.test.txt # Copy launch banner RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \ diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 6776a67ce127..acd10a36214e 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -77,6 +77,7 @@ RUN apt-get update && \ g++ \ ninja-build \ git \ + git-lfs \ # Python runtime - CRITICAL for virtual environment to work python${PYTHON_VERSION}-dev \ python3-pip \ @@ -242,14 +243,17 @@ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \ && cd /opt/dynamo/benchmarks \ - && uv 
pip install . \ + && UV_GIT_LFS=1 uv pip install --no-cache . \ && cd - \ && rm -rf /opt/dynamo/benchmarks # Install common and test dependencies RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ - uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt + UV_GIT_LFS=1 uv pip install \ + --no-cache \ + --requirement /tmp/requirements.txt \ + --requirement /tmp/requirements.test.txt # Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/ COPY --from=framework /opt/hpcx /opt/hpcx diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index d2337637953e..163ad08f51e3 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -192,6 +192,7 @@ RUN apt-get update && \ # jq and curl for polling various endpoints and health checks jq \ git \ + git-lfs \ curl \ # Libraries required by UCX to find RDMA devices libibverbs1 rdma-core ibverbs-utils libibumad3 \ @@ -257,14 +258,17 @@ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \ && cd /opt/dynamo/benchmarks \ - && uv pip install . \ + && UV_GIT_LFS=1 uv pip install --no-cache . \ && cd - \ && rm -rf /opt/dynamo/benchmarks # Install common and test dependencies RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ - uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt + UV_GIT_LFS=1 uv pip install \ + --no-cache \ + --requirement /tmp/requirements.txt \ + --requirement /tmp/requirements.test.txt # Copy benchmarks, examples, and tests for CI COPY . 
/workspace/ diff --git a/tests/planner/README.md b/tests/planner/README.md index e9fdcbe44372..14a7112f715b 100644 --- a/tests/planner/README.md +++ b/tests/planner/README.md @@ -10,7 +10,7 @@ This directory contains comprehensive testing tools for validating the SLA plann The SLA planner monitors metrics every 60 seconds (default adjustment interval) and scales prefill/decode workers based on TTFT, ITL, and request patterns. -To setup the environment, simply use the released docker images for any backends, or build your own docker image following the READMEs in `./components/backends//README.md`, or follow the `Developing Locally` section in [README.md](../../README.md) to setup the environment locally. If using the local environment, make sure to install dependencies by running `uv pip install -r container/deps/requirements.txt` +To setup the environment, simply use the released docker images for any backends, or build your own docker image following the READMEs in `./components/backends//README.md`, or follow the `Developing Locally` section in [README.md](../../README.md) to setup the environment locally. 
If using the local environment, make sure to install dependencies by running `UV_GIT_LFS=1 uv pip install --no-cache -r container/deps/requirements.txt` ## Pre-Requisite: Pre-Deployment Profiling Data diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py index 154fe2ddd26a..7d21cefe1fe3 100644 --- a/tests/profiler/test_profile_sla_aiconfigurator.py +++ b/tests/profiler/test_profile_sla_aiconfigurator.py @@ -97,7 +97,7 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): ("trtllm", "1.0.0rc3"), ], ) - @pytest.mark.parametrize("model_name", ["QWEN3_32B", "GPT_7B", "LLAMA3.1_405B"]) + @pytest.mark.parametrize("model_name", ["QWEN3_32B", "LLAMA3.1_405B"]) async def test_trtllm_aiconfigurator_many( self, trtllm_args, model_name, backend, aic_backend_version ): From 6ef659c3ca70f0ac85daa04b78534969d7384d44 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Thu, 16 Oct 2025 19:00:03 -0400 Subject: [PATCH 12/26] feat: Replace genai-perf with aiperf in components/backends (#3528) (#3682) Signed-off-by: lkomali Signed-off-by: Harrison Saturley-Hall Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com> --- .../backends/sglang/slurm_jobs/scripts/gap/bench.sh | 12 +++++------- components/backends/trtllm/deploy/README.md | 2 +- .../backends/trtllm/performance_sweeps/README.md | 8 ++++---- .../trtllm/performance_sweeps/benchmark_agg.slurm | 2 +- .../trtllm/performance_sweeps/post_process.py | 8 ++++---- .../trtllm/performance_sweeps/scripts/bench.sh | 8 +++----- docs/backends/trtllm/README.md | 4 ++-- docs/backends/trtllm/gpt-oss.md | 2 +- 8 files changed, 21 insertions(+), 25 deletions(-) diff --git a/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh b/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh index ccb4708ddfc6..7eee4fee37d9 100755 --- a/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh +++ 
b/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh @@ -30,14 +30,14 @@ set -e warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x250" set +e -genai_perf_warmup_workers=$(python3 -c "print(max(${DP:-0}, ${prefill_workers:-0}, ${decode_workers:-0}))") +aiperf_warmup_workers=$(python3 -c "print(max(${DP:-0}, ${prefill_workers:-0}, ${decode_workers:-0}))") IFS='x' read -r -a concurrency_list <<< "$chosen_concurrencies" profile_folder="/logs/gap_isl_${chosen_isl}_osl_${chosen_osl}" mkdir -p $profile_folder -tmp_work_dir=$(mktemp -d -t genai-perf-XXXXXXXX) +tmp_work_dir=$(mktemp -d -t aiperf-XXXXXXXX) for concurrency in ${concurrency_list[@]}; do export_folder="${tmp_work_dir}/concurrency_${concurrency}" mkdir -p $export_folder @@ -46,7 +46,7 @@ for concurrency in ${concurrency_list[@]}; do echo "Run benchmark for concurrency $concurrency; ISL $chosen_isl; OSL $chosen_osl" command=( - genai-perf profile + aiperf profile -m ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH} --endpoint-type chat @@ -55,7 +55,7 @@ for concurrency in ${concurrency_list[@]}; do --streaming --concurrency ${concurrency} - --warmup-request-count $(( 2*genai_perf_warmup_workers )) + --warmup-request-count $(( 2*aiperf_warmup_workers )) --request-count $(( 5*concurrency )) --synthetic-input-tokens-mean ${chosen_isl} --synthetic-input-tokens-stddev 0 @@ -69,13 +69,11 @@ for concurrency in ${concurrency_list[@]}; do --tokenizer-trust-remote-code --num-dataset-entries 3000 - -- - --max-threads ${concurrency} ) set -e ${command[@]} set +e - cp $export_folder/*/*_genai_perf.json $profile_folder + cp $export_folder/*/*_aiperf.json $profile_folder done diff --git a/components/backends/trtllm/deploy/README.md b/components/backends/trtllm/deploy/README.md index 8e2d24425a44..ac507663612f 100644 --- a/components/backends/trtllm/deploy/README.md +++ b/components/backends/trtllm/deploy/README.md @@ -271,7 +271,7 @@ args: ## Benchmarking -To 
benchmark your deployment with GenAI-Perf, see this utility script: [perf.sh](../../../../benchmarks/llm/perf.sh) +To benchmark your deployment with AIPerf, see this utility script: [perf.sh](../../../../benchmarks/llm/perf.sh) Configure the `model` name and `host` based on your deployment. diff --git a/components/backends/trtllm/performance_sweeps/README.md b/components/backends/trtllm/performance_sweeps/README.md index aaec28f5436a..0e3a4de174c8 100644 --- a/components/backends/trtllm/performance_sweeps/README.md +++ b/components/backends/trtllm/performance_sweeps/README.md @@ -38,7 +38,7 @@ Please note that: 1. `submit_disagg.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16. 2. `submit_agg.sh` - Main entry point for submitting benchmark jobs for aggregated configurations. -3. `post_process.py` - Scan the genai-perf results to produce a json with entries to each config point. +3. `post_process.py` - Scan the aiperf results to produce a json with entries to each config point. 4. `plot_performance_comparison.py` - Takes the json result file for disaggregated and/or aggregated configuration sweeps and plots a pareto line for better visualization. For more finer grained details on how to launch TRTLLM backend workers with DeepSeek R1 on GB200 slurm, please refer [multinode-examples.md](../../../../docs/backends/trtllm/multinode/multinode-examples.md). This guide shares similar assumption to the multinode examples guide. @@ -117,9 +117,9 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" ## Post-Processing Results -The above jobs use genAI-perf tool to benchmark each configuration point across different concurrency values. These get stored in `dynamo_disagg-bm-8150-1024//genai_perf_artifacts` and `dynamo_agg-bm-8150-1024//genai_perf_artifacts` for disaggregated and aggregated respectively. 
+The above jobs use aiperf tool to benchmark each configuration point across different concurrency values. These get stored in `dynamo_disagg-bm-8150-1024//aiperf_artifacts` and `dynamo_agg-bm-8150-1024//aiperf_artifacts` for disaggregated and aggregated respectively. -After your benchmarking jobs have completed, you can use the `post_process.py` script to aggregate and summarize the results from the generated genai_perf_artifacts. +After your benchmarking jobs have completed, you can use the `post_process.py` script to aggregate and summarize the results from the generated aiperf_artifacts. To run the post-processing script, use: @@ -149,6 +149,6 @@ Refer to [Beyond the Buzz: A Pragmatic Take on Inference Disaggregation](https:/ ## Known Issues -- Some jobs may time out if genai-perf requires more time to complete all concurrency levels. +- Some jobs may time out if aiperf requires more time to complete all concurrency levels. - Workers may encounter out-of-memory (OOM) errors during inference, especially with larger configurations. - Configurations affected by these issues will result in missing data points on the performance plot. 
diff --git a/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm index 7d5ca6323547..693274840de5 100755 --- a/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm +++ b/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm @@ -40,7 +40,7 @@ if [ "${enable_attention_dp}" = "false" ]; then fi full_logdir=${sub_dir} -artifacts_dir=${full_logdir}/genai_perf_artifacts +artifacts_dir=${full_logdir}/aiperf_artifacts mkdir -p ${artifacts_dir} diff --git a/components/backends/trtllm/performance_sweeps/post_process.py b/components/backends/trtllm/performance_sweeps/post_process.py index c0a045411067..a9e5ed28c557 100755 --- a/components/backends/trtllm/performance_sweeps/post_process.py +++ b/components/backends/trtllm/performance_sweeps/post_process.py @@ -124,7 +124,7 @@ def extract_throughput_data(csv_path: str) -> Tuple[Optional[float], Optional[fl Extract throughput data from CSV file Args: - csv_path: Path to profile_export_genai_perf.csv + csv_path: Path to profile_export_aiperf.csv Returns: Tuple of (output_token_throughput, output_token_throughput_per_user) @@ -184,10 +184,10 @@ def process_directory(dir_path: str) -> Optional[List[Dict[str, Any]]]: Dictionary containing extracted data, or None if processing failed """ dir_path_obj = Path(dir_path) - artifacts_path = dir_path_obj / "genai_perf_artifacts" + artifacts_path = dir_path_obj / "aiperf_artifacts" if not artifacts_path.exists(): - print(f"Warning: No genai_perf_artifacts directory found in {dir_path}") + print(f"Warning: No aiperf_artifacts directory found in {dir_path}") return None # Parse deployment configuration @@ -205,7 +205,7 @@ def process_directory(dir_path: str) -> Optional[List[Dict[str, Any]]]: csv_files = [] for item in artifacts_path.iterdir(): if item.is_dir(): - csv_path = item / "profile_export_genai_perf.csv" + csv_path = item / "profile_export_aiperf.csv" if csv_path.exists(): 
csv_files.append(str(csv_path)) diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh index c7141b44bbd3..e79c03f08db8 100755 --- a/components/backends/trtllm/performance_sweeps/scripts/bench.sh +++ b/components/backends/trtllm/performance_sweeps/scripts/bench.sh @@ -54,8 +54,8 @@ set -x config_file=${log_path}/config.yaml -# install genai-perf -pip install genai-perf +# install aiperf +pip install aiperf # Create artifacts root directory if it doesn't exist if [ ! -d "${artifacts_dir}" ]; then @@ -153,7 +153,7 @@ for concurrency in ${concurrency_list}; do num_prompts=$((concurrency * multi_round)) echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts" mkdir -p ${log_path}/concurrency_${concurrency} - genai-perf profile \ + aiperf profile \ --model ${model} \ --tokenizer ${model_path} \ --endpoint-type chat \ @@ -174,9 +174,7 @@ for concurrency in ${concurrency_list}; do --num-dataset-entries ${num_prompts} \ --random-seed 100 \ --artifact-dir ${artifacts_dir} \ - -- \ -v \ - --max-threads ${concurrency} \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' echo "Benchmark with concurrency ${concurrency} done" diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md index f5d5fa1d1ba6..12fbb9e5a9db 100644 --- a/docs/backends/trtllm/README.md +++ b/docs/backends/trtllm/README.md @@ -196,7 +196,7 @@ NOTE: To send a request to a multi-node deployment, target the node which is run ### Benchmarking -To benchmark your deployment with GenAI-Perf, see this utility script, configuring the +To benchmark your deployment with AIPerf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) @@ -236,7 +236,7 @@ NOTE: To send a request to a multi-node deployment, target the node which is run ## Benchmarking -To benchmark your deployment with GenAI-Perf, see 
this utility script, configuring the +To benchmark your deployment with AIPerf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) ## Multimodal support diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 071b88bb2e2d..948f15780b6e 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ### Performance Testing with AIPerf -The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. +The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. 
**Run the following benchmark from inside the container** (after completing the deployment steps above): From b28b8bb38315d75aaff23fe51edb452650728a4b Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Thu, 16 Oct 2025 19:12:56 -0400 Subject: [PATCH 13/26] fix: Cherry-pick in last of aiperf replacements (#3683) Signed-off-by: lkomali Signed-off-by: Harrison Saturley-Hall Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com> --- benchmarks/router/README.md | 2 +- benchmarks/router/prefix_ratio_benchmark.py | 55 +++++++++---------- benchmarks/router/real_data_benchmark.py | 30 +++++----- benchmarks/sin_load_generator/README.md | 2 +- benchmarks/utils/{genai.py => aiperf.py} | 0 benchmarks/utils/workflow.py | 2 +- docs/backends/trtllm/gpt-oss.md | 2 +- docs/benchmarks/benchmarking.md | 4 +- docs/guides/disagg_perf_tuning.md | 6 +- .../Distributed_Inference/README.md | 2 +- .../kubernetes/shared_frontend/README.md | 2 +- .../deployments/router_standalone/README.md | 2 +- .../deployments/router_standalone/perf.sh | 4 +- tests/planner/scaling/run_scaling_test.sh | 6 +- 14 files changed, 55 insertions(+), 64 deletions(-) rename benchmarks/utils/{genai.py => aiperf.py} (100%) diff --git a/benchmarks/router/README.md b/benchmarks/router/README.md index 40d8f127fd6b..207d27e4a57a 100644 --- a/benchmarks/router/README.md +++ b/benchmarks/router/README.md @@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli > [!Note] > At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files: > ```bash -> pip install git+https://github.com/ai-dynamo/aiperf.git +> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf > ``` > However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is. 
diff --git a/benchmarks/router/prefix_ratio_benchmark.py b/benchmarks/router/prefix_ratio_benchmark.py index 5524bc37e522..4a1a1dd435c9 100755 --- a/benchmarks/router/prefix_ratio_benchmark.py +++ b/benchmarks/router/prefix_ratio_benchmark.py @@ -27,7 +27,7 @@ logger.addHandler(console_handler) -def get_genai_perf_cmd( +def get_aiperf_cmd( model, tokenizer, # Add tokenizer parameter prefix_ratio, @@ -40,12 +40,12 @@ def get_genai_perf_cmd( artifact_dir, url="http://localhost:8888", ): - """Build genai-perf command based on prefix ratio""" + """Build aiperf command based on prefix ratio""" prefix_length = int(isl * prefix_ratio) synthetic_input_length = int(isl * (1 - prefix_ratio)) return [ - "genai-perf", + "aiperf", "profile", "--model", model, @@ -84,10 +84,7 @@ def get_genai_perf_cmd( str(num_prefix_prompts), "--artifact-dir", artifact_dir, - "--", "-v", - "--max-threads", - "256", "-H", "Authorization: Bearer NOT USED", "-H", @@ -95,17 +92,17 @@ def get_genai_perf_cmd( ] -def get_gap_result(artifact_dir: str) -> dict: - """Parse genai-perf results from JSON file""" +def get_aiperf_result(artifact_dir: str) -> dict: + """Parse aiperf results from JSON file""" json_file_path = None for root, _, files in os.walk(artifact_dir): - if "profile_export_genai_perf.json" in files: - json_file_path = os.path.join(root, "profile_export_genai_perf.json") + if "profile_export_aiperf.json" in files: + json_file_path = os.path.join(root, "profile_export_aiperf.json") break if json_file_path is None: raise FileNotFoundError( - f"profile_export_genai_perf.json not found in {artifact_dir}" + f"profile_export_aiperf.json not found in {artifact_dir}" ) with open(json_file_path, "r") as f: @@ -125,8 +122,8 @@ def run_benchmark_single_url( artifact_dir, url, ) -> Optional[Dict]: - """Run genai-perf benchmark for a single URL""" - genai_perf_cmd = get_genai_perf_cmd( + """Run aiperf benchmark for a single URL""" + aiperf_cmd = get_aiperf_cmd( model, tokenizer, # Pass tokenizer 
parameter prefix_ratio, @@ -140,21 +137,21 @@ def run_benchmark_single_url( url, ) - logger.info(f"Running command for URL {url}: {' '.join(genai_perf_cmd)}") + logger.info(f"Running command for URL {url}: {' '.join(aiperf_cmd)}") try: - gap_process = subprocess.run( - genai_perf_cmd, capture_output=True, text=True, check=True + aiperf_process = subprocess.run( + aiperf_cmd, capture_output=True, text=True, check=True ) - logger.info(f"Genai-perf profiling completed successfully for URL {url}") - logger.info(gap_process.stdout) + logger.info(f"AIPerf profiling completed successfully for URL {url}") + logger.info(aiperf_process.stdout) - gap_result = get_gap_result(artifact_dir) - return gap_result + aiperf_result = get_aiperf_result(artifact_dir) + return aiperf_result except subprocess.CalledProcessError as e: - logger.error(f"Genai-perf failed for URL {url} with error code: {e.returncode}") + logger.error(f"AIPerf failed for URL {url} with error code: {e.returncode}") logger.error(f"stderr: {e.stderr}") return None @@ -197,7 +194,7 @@ def run_benchmark( output_dir, urls, ) -> Optional[Dict]: - """Run genai-perf benchmark for a specific prefix ratio""" + """Run aiperf benchmark for a specific prefix ratio""" logger.info( f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}" ) @@ -242,7 +239,7 @@ def run_benchmark( os.makedirs(artifact_dir, exist_ok=True) artifact_dirs.append(artifact_dir) - genai_perf_cmd = get_genai_perf_cmd( + aiperf_cmd = get_aiperf_cmd( model, tokenizer, # Pass tokenizer parameter prefix_ratio, @@ -256,10 +253,10 @@ def run_benchmark( url, ) - logger.info(f"Launching process for URL {url}: {' '.join(genai_perf_cmd)}") + logger.info(f"Launching process for URL {url}: {' '.join(aiperf_cmd)}") process = subprocess.Popen( - genai_perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) processes.append((process, url, artifact_dir)) @@ 
-269,18 +266,18 @@ def run_benchmark( stdout, stderr = process.communicate() if process.returncode == 0: - logger.info(f"Genai-perf completed successfully for URL {url}") + logger.info(f"AIPerf completed successfully for URL {url}") logger.info(stdout) try: - gap_result = get_gap_result(artifact_dir) - results.append(gap_result) + aiperf_result = get_aiperf_result(artifact_dir) + results.append(aiperf_result) except Exception as e: logger.error(f"Failed to get results for URL {url}: {e}") results.append(None) else: logger.error( - f"Genai-perf failed for URL {url} with error code: {process.returncode}" + f"AIPerf failed for URL {url} with error code: {process.returncode}" ) logger.error(f"stderr: {stderr}") results.append(None) diff --git a/benchmarks/router/real_data_benchmark.py b/benchmarks/router/real_data_benchmark.py index 34ceaf0558ef..a594b8467d86 100755 --- a/benchmarks/router/real_data_benchmark.py +++ b/benchmarks/router/real_data_benchmark.py @@ -24,7 +24,7 @@ logger.addHandler(console_handler) -def get_genai_perf_cmd_for_trace( +def get_aiperf_cmd_for_trace( model, tokenizer, input_dataset, @@ -33,7 +33,7 @@ def get_genai_perf_cmd_for_trace( url="http://localhost:8888", ): return [ - "genai-perf", + "aiperf", "profile", "--model", model, @@ -47,17 +47,13 @@ def get_genai_perf_cmd_for_trace( "--url", url, "--input-file", - f"payload:{input_dataset}", - "--fixed-schedule", - "True", + f"{input_dataset}", + "--fixed-schedule-auto-offset", "--random-seed", str(seed), "--artifact-dir", artifact_dir, - "--", "-v", - "--max-threads", - "256", "-H", "Authorization: Bearer NOT USED", "-H", @@ -73,8 +69,8 @@ def run_benchmark_with_trace( url, seed, ): - """Run genai-perf benchmark with a trace dataset""" - genai_perf_cmd = get_genai_perf_cmd_for_trace( + """Run aiperf benchmark with a trace dataset""" + aiperf_cmd = get_aiperf_cmd_for_trace( model, tokenizer, trace_dataset, @@ -83,17 +79,17 @@ def run_benchmark_with_trace( url, ) - logger.info(f"Running 
genai-perf with trace dataset: {trace_dataset}") - logger.info(f"Command: {' '.join(genai_perf_cmd)}") + logger.info(f"Running aiperf with trace dataset: {trace_dataset}") + logger.info(f"Command: {' '.join(aiperf_cmd)}") try: - # Run genai-perf and let it output directly to terminal - subprocess.run(genai_perf_cmd, check=True) + # Run aiperf and let it output directly to terminal + subprocess.run(aiperf_cmd, check=True) - logger.info("Genai-perf profiling completed successfully") + logger.info("AIPerf profiling completed successfully") except subprocess.CalledProcessError as e: - logger.error(f"Genai-perf failed with error code: {e.returncode}") + logger.error(f"AIPerf failed with error code: {e.returncode}") logger.error(f"stderr: {e.stderr}") raise @@ -301,7 +297,7 @@ def main(): logger.info(f"Synthetic trace data saved to: {trace_dataset_path}") # Run benchmark with the trace dataset - artifact_dir = os.path.join(args.output_dir, "genai_perf_artifacts") + artifact_dir = os.path.join(args.output_dir, "aiperf_artifacts") os.makedirs(artifact_dir, exist_ok=True) run_benchmark_with_trace( diff --git a/benchmarks/sin_load_generator/README.md b/benchmarks/sin_load_generator/README.md index 82b7dee5b9c5..f0b8e4741277 100644 --- a/benchmarks/sin_load_generator/README.md +++ b/benchmarks/sin_load_generator/README.md @@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0 # Sinusoidal Load Generator -`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf). +`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf). 
## Usage diff --git a/benchmarks/utils/genai.py b/benchmarks/utils/aiperf.py similarity index 100% rename from benchmarks/utils/genai.py rename to benchmarks/utils/aiperf.py diff --git a/benchmarks/utils/workflow.py b/benchmarks/utils/workflow.py index fd939f5938e5..bb9e20fbe757 100644 --- a/benchmarks/utils/workflow.py +++ b/benchmarks/utils/workflow.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Dict, List -from benchmarks.utils.genai import run_concurrency_sweep +from benchmarks.utils.aiperf import run_concurrency_sweep from deploy.utils.kubernetes import is_running_in_cluster diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 948f15780b6e..0e4679a55408 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ### Performance Testing with AIPerf -The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. +The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. 
**Run the following benchmark from inside the container** (after completing the deployment steps above): diff --git a/docs/benchmarks/benchmarking.md b/docs/benchmarks/benchmarking.md index 8aebb9780fb6..f739360528b6 100644 --- a/docs/benchmarks/benchmarking.md +++ b/docs/benchmarks/benchmarking.md @@ -283,7 +283,7 @@ results/ # Client-side: ./benchmarks/results/ or custom │ └── avg_time_to_first_token_vs_concurrency.png ├── / # Results for your benchmark (uses your custom name) │ ├── c1/ # Concurrency level 1 -│ │ └── profile_export_genai_perf.json +│ │ └── profile_export_aiperf.json │ ├── c2/ # Concurrency level 2 │ ├── c5/ # Concurrency level 5 │ └── ... # Other concurrency levels (10, 50, 100, 250) @@ -457,7 +457,7 @@ Results are stored in `/data/results` and follow the same structure as client-si /data/results/ └── / # Results for your benchmark name ├── c1/ # Concurrency level 1 - │ └── profile_export_genai_perf.json + │ └── profile_export_aiperf.json ├── c2/ # Concurrency level 2 └── ... # Other concurrency levels ``` diff --git a/docs/guides/disagg_perf_tuning.md b/docs/guides/disagg_perf_tuning.md index d0fc03459d66..ab5a9e72d4d5 100644 --- a/docs/guides/disagg_perf_tuning.md +++ b/docs/guides/disagg_perf_tuning.md @@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern: | 2 | 269 | 135 | 1.19x | | 4 | 578 | 144 | 1.28x | -The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency test using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and compare with the SLA. -GenAI-Perf is pre-installed in the dynamo container. +The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency test using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and compare with the SLA. +AIPerf is pre-installed in the dynamo container. 
> [!Tip] -> If you are unfamiliar with GenAI-Perf, please see this helpful [tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md) to get you started. +> If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started. Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size. For prefill engines, usually a small batch size and large `max_num_token` is preferred. diff --git a/examples/basics/kubernetes/Distributed_Inference/README.md b/examples/basics/kubernetes/Distributed_Inference/README.md index 1a1515608b6e..2758b3d2388f 100644 --- a/examples/basics/kubernetes/Distributed_Inference/README.md +++ b/examples/basics/kubernetes/Distributed_Inference/README.md @@ -54,4 +54,4 @@ curl localhost:8000/v1/chat/completions \ "max_tokens": 30 }' ``` -You can also benchmark the performance of the endpoint by [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html) +You can also benchmark the performance of the endpoint by [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md) diff --git a/examples/basics/kubernetes/shared_frontend/README.md b/examples/basics/kubernetes/shared_frontend/README.md index 712342510c7a..e692001a7d48 100644 --- a/examples/basics/kubernetes/shared_frontend/README.md +++ b/examples/basics/kubernetes/shared_frontend/README.md @@ -39,4 +39,4 @@ curl localhost:8000/v1/chat/completions \ "max_tokens": 30 }' ``` -You can also benchmark the performance of the endpoint by [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html) +You can also benchmark the performance of the endpoint by [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md) diff --git 
a/examples/deployments/router_standalone/README.md b/examples/deployments/router_standalone/README.md index 339b07e7d5f8..1fcb6f19401a 100644 --- a/examples/deployments/router_standalone/README.md +++ b/examples/deployments/router_standalone/README.md @@ -80,7 +80,7 @@ While not implemented in this example, the router can also operate in a pure pre - Integrates with vLLM's OpenAI serving components for request preprocessing and response formatting ### `perf.sh` -- Benchmarking script using `genai-perf` to test the router setup +- Benchmarking script using `aiperf` to test the router setup - Configured for streaming chat completions with synthetic workloads - Tests concurrent requests to evaluate routing performance diff --git a/examples/deployments/router_standalone/perf.sh b/examples/deployments/router_standalone/perf.sh index ee59fad45cae..9c5df0c58709 100644 --- a/examples/deployments/router_standalone/perf.sh +++ b/examples/deployments/router_standalone/perf.sh @@ -28,7 +28,7 @@ num_unique_prompts=10 seed=42 -genai-perf profile \ +aiperf profile \ --model ${model} \ --tokenizer ${model} \ --endpoint-type ${type} \ @@ -47,8 +47,6 @@ genai-perf profile \ --request-count ${num_requests} \ --num-dataset-entries ${num_unique_prompts} \ --random-seed ${seed} \ - -- \ -v \ - --max-threads 256 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' diff --git a/tests/planner/scaling/run_scaling_test.sh b/tests/planner/scaling/run_scaling_test.sh index 4c0064f951c1..ce1077d8a9ec 100755 --- a/tests/planner/scaling/run_scaling_test.sh +++ b/tests/planner/scaling/run_scaling_test.sh @@ -64,9 +64,9 @@ check_prerequisites() { exit 1 fi - # Check for genai-perf - if ! command -v genai-perf &> /dev/null; then - log_error "genai-perf not found. This tool is required for load generation." + # Check for aiperf + if ! command -v aiperf &> /dev/null; then + log_error "aiperf not found. This tool is required for load generation." 
log_error "Please install the required dependencies by following the instructions in tests/planner/README.md" exit 1 fi From c55f34afcefea4f671cadb21886ac6c4034eb159 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 16 Oct 2025 16:19:05 -0700 Subject: [PATCH 14/26] fix: Reduce memory usage to avoid vLLM dsr1 OOM (#3660) (#3661) --- components/backends/vllm/launch/dsr1_dep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/backends/vllm/launch/dsr1_dep.sh b/components/backends/vllm/launch/dsr1_dep.sh index 4a31a157708a..4d8c30332387 100755 --- a/components/backends/vllm/launch/dsr1_dep.sh +++ b/components/backends/vllm/launch/dsr1_dep.sh @@ -101,10 +101,10 @@ for ((i=0; i&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}.log & done From b08e97b550f9d58b9bac6f26dcf44352ab341731 Mon Sep 17 00:00:00 2001 From: ishandhanani <82981111+ishandhanani@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:36:52 -0700 Subject: [PATCH 15/26] fix: cherry pick sglang bump + fix k8s yamls (#3708) --- components/backends/sglang/deploy/disagg-multinode.yaml | 4 ++++ components/backends/sglang/deploy/disagg.yaml | 9 ++++++++- components/backends/sglang/deploy/disagg_planner.yaml | 8 ++++++++ container/Dockerfile.sglang | 2 +- container/Dockerfile.sglang-wideep | 2 +- docs/backends/sglang/README.md | 4 ++-- docs/backends/sglang/dsr1-wideep-h100.md | 2 ++ docs/backends/sglang/multinode-examples.md | 4 ++++ pyproject.toml | 2 +- .../deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml | 4 +++- .../deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml | 4 +++- 11 files changed, 37 insertions(+), 8 deletions(-) diff --git a/components/backends/sglang/deploy/disagg-multinode.yaml b/components/backends/sglang/deploy/disagg-multinode.yaml index 15be05a8a541..b408b5806acd 100644 --- a/components/backends/sglang/deploy/disagg-multinode.yaml +++ b/components/backends/sglang/deploy/disagg-multinode.yaml @@ -56,6 +56,8 @@ spec: - nixl - --disaggregation-bootstrap-port - "30001" + - 
--host + - "0.0.0.0" - --mem-fraction-static - "0.82" prefill: @@ -93,3 +95,5 @@ spec: - "30001" - --mem-fraction-static - "0.82" + - --host + - "0.0.0.0" \ No newline at end of file diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index a1fbf5ab1805..caa19057b132 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -46,7 +46,10 @@ spec: - decode - --disaggregation-transfer-backend - nixl - + - --disaggregation-bootstrap-port + - "12345" + - --host + - "0.0.0.0" prefill: envFromSecret: hf-token-secret dynamoNamespace: sglang-disagg @@ -79,3 +82,7 @@ spec: - prefill - --disaggregation-transfer-backend - nixl + - --disaggregation-bootstrap-port + - "12345" + - --host + - "0.0.0.0" \ No newline at end of file diff --git a/components/backends/sglang/deploy/disagg_planner.yaml b/components/backends/sglang/deploy/disagg_planner.yaml index 835373fec4e0..341cee93b6c1 100644 --- a/components/backends/sglang/deploy/disagg_planner.yaml +++ b/components/backends/sglang/deploy/disagg_planner.yaml @@ -70,6 +70,10 @@ spec: - decode - --disaggregation-transfer-backend - nixl + - --disaggregation-bootstrap-port + - "12345" + - --host + - "0.0.0.0" prefill: dynamoNamespace: dynamo envFromSecret: hf-token-secret @@ -102,3 +106,7 @@ spec: - prefill - --disaggregation-transfer-backend - nixl + - --disaggregation-bootstrap-port + - "12345" + - --host + - "0.0.0.0" diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index 92bb8c03ce8f..03a655bfe57f 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -14,7 +14,7 @@ ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" # Make sure to update the dependency version in pyproject.toml when updating this -ARG SGLANG_VERSION="0.5.3.post1" +ARG SGLANG_VERSION="0.5.3.post2" # Define general architecture ARGs for supporting both x86 and aarch64 
builds. diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index 3313929e1616..8d08dad0f817 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -ARG SGLANG_IMAGE_TAG="v0.5.3.post1" +ARG SGLANG_IMAGE_TAG="v0.5.3.post2" ARG BRANCH_TYPE FROM scratch AS local_src diff --git a/docs/backends/sglang/README.md b/docs/backends/sglang/README.md index 4697b0797cd4..6604edb5ee81 100644 --- a/docs/backends/sglang/README.md +++ b/docs/backends/sglang/README.md @@ -104,8 +104,8 @@ cd $DYNAMO_HOME # installs sglang supported version along with dynamo # include the prerelease flag to install flashinfer rc versions uv pip install -e . -# install any sglang version >= 0.5.3 -uv pip install "sglang[all]==0.5.3.post1" +# install any sglang version >= 0.5.3.post2 +uv pip install "sglang[all]==0.5.3.post2" ``` diff --git a/docs/backends/sglang/dsr1-wideep-h100.md b/docs/backends/sglang/dsr1-wideep-h100.md index a0f2f83ba32a..0bff93c6072e 100644 --- a/docs/backends/sglang/dsr1-wideep-h100.md +++ b/docs/backends/sglang/dsr1-wideep-h100.md @@ -58,6 +58,7 @@ python3 -m dynamo.sglang \ --skip-tokenizer-init \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ + --host 0.0.0.0 \ --disaggregation-bootstrap-port 30001 \ --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ --nnodes 4 \ @@ -95,6 +96,7 @@ python3 -m dynamo.sglang \ --disaggregation-mode decode \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ --nnodes 4 \ --node-rank 0 \ diff --git a/docs/backends/sglang/multinode-examples.md b/docs/backends/sglang/multinode-examples.md index be78f261f0f7..44b618db3ea5 100644 --- a/docs/backends/sglang/multinode-examples.md +++ 
b/docs/backends/sglang/multinode-examples.md @@ -39,6 +39,7 @@ python3 -m dynamo.sglang \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ --mem-fraction-static 0.82 ``` @@ -58,6 +59,7 @@ python3 -m dynamo.sglang \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ --mem-fraction-static 0.82 ``` @@ -77,6 +79,7 @@ python3 -m dynamo.sglang \ --disaggregation-mode decode \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ --mem-fraction-static 0.82 ``` @@ -96,6 +99,7 @@ python3 -m dynamo.sglang \ --disaggregation-mode decode \ --disaggregation-transfer-backend nixl \ --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ --mem-fraction-static 0.82 ``` diff --git a/pyproject.toml b/pyproject.toml index 61e2c32a043c..ee4208c98e23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ vllm = [ sglang = [ "uvloop", "nixl<=0.6.0", - "sglang[all]==0.5.3.post1", + "sglang[all]==0.5.3.post2", ] [dependency-groups] diff --git a/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml b/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml index 2c1bfb01c602..433bdb7645d6 100644 --- a/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml +++ b/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml @@ -67,6 +67,7 @@ spec: --disaggregation-transfer-backend nixl --disaggregation-bootstrap-port 30001 --mem-fraction-static 0.8 + --host 0.0.0.0 prefill: dynamoNamespace: sgl-dsr1-16gpu componentType: worker @@ -107,4 +108,5 @@ spec: --disaggregation-mode prefill --disaggregation-transfer-backend nixl --disaggregation-bootstrap-port 30001 - --mem-fraction-static 0.8 \ No newline at end of file + --mem-fraction-static 0.8 + --host 0.0.0.0 \ No newline at end of file diff --git a/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml 
b/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml index d6f20e5cd6e4..96ffbc680a81 100644 --- a/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml +++ b/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml @@ -64,6 +64,7 @@ spec: --disaggregation-mode decode --disaggregation-transfer-backend nixl --disaggregation-bootstrap-port 30001 + --host 0.0.0.0 prefill: dynamoNamespace: sgl-dsr1-8gpu componentType: worker @@ -101,4 +102,5 @@ spec: --skip-tokenizer-init --disaggregation-mode prefill --disaggregation-transfer-backend nixl - --disaggregation-bootstrap-port 30001 \ No newline at end of file + --disaggregation-bootstrap-port 30001 + --host 0.0.0.0 \ No newline at end of file From 249c21aa0585f4d5e3e0ef0e25fa05ae8b07b497 Mon Sep 17 00:00:00 2001 From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:41:46 -0700 Subject: [PATCH 16/26] =?UTF-8?q?fix:=20json=20strings=20should=20remain?= =?UTF-8?q?=20intact=20through=20profiler=20arg=20processin=E2=80=A6=20(#3?= =?UTF-8?q?689)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hannah Zhang --- benchmarks/profiler/utils/config.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 426dc80a6238..065f73453d6e 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -130,8 +130,17 @@ def break_arguments(args: list[str] | None) -> list[str]: else: for arg in args: if arg is not None: - # Use shlex.split to properly handle quoted arguments - ans.extend(shlex.split(arg)) + # If the arg looks like it might be JSON (starts with { or [) or is already a single token, + # don't split it further. Only split if it contains spaces AND doesn't look like JSON. 
+ if ( + isinstance(arg, str) + and (" " in arg or "\t" in arg) + and not (arg.strip().startswith(("{", "["))) + ): + # Use shlex.split to properly handle quoted arguments + ans.extend(shlex.split(arg)) + else: + ans.append(arg) return ans @@ -971,7 +980,9 @@ def convert_config( # - Disable enable_block_reuse (no KV reuse for prefill-only) # - Enable overlap scheduler (disabled in prefill.yaml but needed for agg) # - Remove cache_transceiver_config (not needed in agg mode) - if "kv_cache_config" not in override_dict: + if "kv_cache_config" not in override_dict or not isinstance( + override_dict["kv_cache_config"], dict + ): override_dict["kv_cache_config"] = {} override_dict["kv_cache_config"]["enable_block_reuse"] = False override_dict[ @@ -1022,7 +1033,9 @@ def convert_config( # Merge our overrides for converting decode-only disagg to aggregated: # - Enable enable_block_reuse (to skip prefill in decode-only) # - Remove cache_transceiver_config (not needed in agg mode) - if "kv_cache_config" not in override_dict: + if "kv_cache_config" not in override_dict or not isinstance( + override_dict["kv_cache_config"], dict + ): override_dict["kv_cache_config"] = {} override_dict["kv_cache_config"]["enable_block_reuse"] = True override_dict[ From 8bc9f2f6f8cbd9ee4dfeb8b8e65e7a08db419bb8 Mon Sep 17 00:00:00 2001 From: Yan Ru Pei Date: Fri, 17 Oct 2025 12:42:46 -0700 Subject: [PATCH 17/26] feat: (cherrypick) custom distributed rw lock for radix snapshotting and downloading (#3692) Signed-off-by: PeaBrane --- lib/llm/src/kv_router/subscriber.rs | 194 +++++------- lib/runtime/src/transports/etcd.rs | 2 + lib/runtime/src/transports/etcd/lock.rs | 396 ++++++++++++++++++++++++ 3 files changed, 483 insertions(+), 109 deletions(-) create mode 100644 lib/runtime/src/transports/etcd/lock.rs diff --git a/lib/llm/src/kv_router/subscriber.rs b/lib/llm/src/kv_router/subscriber.rs index 6f47e7e0467b..29f7f39c42fa 100644 --- a/lib/llm/src/kv_router/subscriber.rs +++ 
b/lib/llm/src/kv_router/subscriber.rs @@ -11,7 +11,7 @@ use dynamo_runtime::{ prelude::*, traits::events::EventPublisher, transports::{ - etcd::{Client as EtcdClient, WatchEvent}, + etcd::{Client as EtcdClient, DistributedRWLock, WatchEvent}, nats::{NatsQueue, Slug}, }, }; @@ -32,47 +32,27 @@ use crate::{ struct SnapshotResources { nats_client: dynamo_runtime::transports::nats::Client, bucket_name: String, - lock_name: String, + rwlock: DistributedRWLock, instances_rx: tokio::sync::watch::Receiver>, get_workers_tx: mpsc::Sender, snapshot_tx: mpsc::Sender, } impl SnapshotResources { - /// Try to acquire distributed lock for snapshot operations - /// Returns Some(lock_response) if lock acquired, None if another instance holds it - async fn lock(&self, etcd_client: &EtcdClient) -> Option { - match etcd_client - .lock(self.lock_name.clone(), Some(etcd_client.lease_id())) - .await - { - Ok(response) => { - tracing::debug!( - "Successfully acquired snapshot lock with key: {:?}", - response.key() - ); - Some(response) - } - Err(e) => { - tracing::debug!("Another instance already holds the snapshot lock: {e:?}"); - None - } - } - } - - /// Release the distributed lock - async fn unlock(&self, etcd_client: &EtcdClient, lock_response: etcd_client::LockResponse) { - if let Err(e) = etcd_client.unlock(lock_response.key()).await { - tracing::warn!("Failed to release snapshot lock: {e:?}"); - } - } - - /// Perform snapshot upload and purge operations + /// Perform snapshot upload and purge operations with write lock async fn purge_then_snapshot( &self, + etcd_client: &EtcdClient, nats_queue: &mut NatsQueue, remove_worker_tx: &mpsc::Sender, ) -> anyhow::Result<()> { + // Try to acquire write lock (non-blocking) + let Some(_write_guard) = self.rwlock.try_write_lock(etcd_client).await else { + tracing::debug!( + "Could not acquire write lock for snapshot (readers active or lock held)" + ); + anyhow::bail!("Write lock unavailable"); + }; // Purge before snapshot ensures 
new/warm-restarted routers won't replay already-acknowledged messages. // Since KV events are idempotent, this ordering reduces unnecessary reprocessing while maintaining // at-least-once delivery guarantees. The snapshot will capture the clean state after purge. @@ -100,13 +80,11 @@ impl SnapshotResources { for worker_id in indexer_worker_ids { if !current_worker_ids.contains(&worker_id) { tracing::info!( - "Removing stale worker {} from indexer during snapshot", - worker_id + "Removing stale worker {worker_id} from indexer during snapshot" ); if let Err(e) = remove_worker_tx.send(worker_id).await { tracing::warn!( - "Failed to send remove_worker for stale worker {}: {e:?}", - worker_id + "Failed to send remove_worker for stale worker {worker_id}: {e:?}" ); } } @@ -193,11 +171,21 @@ pub async fn start_kv_router_background( .build()?; let nats_client = client_options.connect().await?; + // Get etcd client (needed for both snapshots and router watching) + let etcd_client = component + .drt() + .etcd_client() + .ok_or_else(|| anyhow::anyhow!("etcd client not available"))?; + // Create bucket name for snapshots/state let bucket_name = Slug::slugify(&format!("{}-{RADIX_STATE_BUCKET}", component.subject())) .to_string() .replace("_", "-"); + // Create RWLock for snapshot coordination + let lock_prefix = format!("{}/{}", ROUTER_SNAPSHOT_LOCK, component.subject()); + let snapshot_rwlock = DistributedRWLock::new(lock_prefix); + // Handle initial state based on router_reset_states flag if router_reset_states { // Delete the bucket to reset state @@ -206,43 +194,48 @@ pub async fn start_kv_router_background( tracing::warn!("Failed to delete bucket (may not exist): {e:?}"); } } else { - // Try to download initial state from object store + // Try to download initial state from object store with read lock let url = url::Url::parse(&format!( "nats://{}/{bucket_name}/{RADIX_STATE_FILE}", nats_client.addr() ))?; - match nats_client - .object_store_download_data::>(&url) + // 
Acquire read lock with default timeout + if let Ok(_read_guard) = snapshot_rwlock + .read_lock_with_wait(&etcd_client, &consumer_uuid, None) .await { - Ok(events) => { - tracing::info!( - "Successfully downloaded {} events from object store", - events.len() - ); - // Send all events to the indexer - for event in events { - if let Err(e) = kv_events_tx.send(event).await { - tracing::warn!("Failed to send initial event to indexer: {e:?}"); + tracing::debug!("Acquired read lock for snapshot download"); + + // Download snapshot while holding read lock + match nats_client + .object_store_download_data::>(&url) + .await + { + Ok(events) => { + tracing::info!( + "Successfully downloaded {} events from object store", + events.len() + ); + // Send all events to the indexer + for event in events { + if let Err(e) = kv_events_tx.send(event).await { + tracing::warn!("Failed to send initial event to indexer: {e:?}"); + } } + tracing::info!("Successfully sent all initial events to indexer"); + } + Err(e) => { + tracing::info!( + "Did not initialize radix state from NATS object store (likely no snapshots yet): {e:?}" + ); } - tracing::info!("Successfully sent all initial events to indexer"); - } - Err(e) => { - tracing::info!( - "Did not initialize radix state from NATs object store (likely no snapshots yet): {e:?}" - ); } + } else { + tracing::warn!("Could not acquire read lock for snapshot download (timeout or error)"); } } - // Get etcd client (needed for both snapshots and router watching) - let etcd_client = component - .drt() - .etcd_client() - .ok_or_else(|| anyhow::anyhow!("etcd client not available"))?; - // Cleanup orphaned consumers on startup cleanup_orphaned_consumers(&mut nats_queue, &etcd_client, &component, &consumer_uuid).await; @@ -251,7 +244,6 @@ pub async fn start_kv_router_background( .kv_get_and_watch_prefix(&format!("{}/", KV_ROUTERS_ROOT_PATH)) .await? 
.dissolve(); - let cleanup_lock_name = format!("{}/{}", ROUTER_CLEANUP_LOCK, component.subject()); // Get the generate endpoint and watch for instance deletions let generate_endpoint = component.endpoint("generate"); @@ -275,12 +267,10 @@ pub async fn start_kv_router_background( maybe_snapshot_tx, router_snapshot_threshold, ) { - let lock_name = format!("{}/{}", ROUTER_SNAPSHOT_LOCK, component.subject()); - Some(SnapshotResources { nats_client, bucket_name, - lock_name, + rwlock: snapshot_rwlock.clone(), instances_rx, get_workers_tx, snapshot_tx, @@ -317,19 +307,19 @@ pub async fn start_kv_router_background( // Extract the hex worker ID after the colon (e.g., "generate:694d99badb9f7c07" -> "694d99badb9f7c07") let Some(worker_id_str) = key.split(':').next_back() else { - tracing::warn!("Could not extract worker ID from instance key: {}", key); + tracing::warn!("Could not extract worker ID from instance key: {key}"); continue; }; // Parse as hexadecimal (base 16) let Ok(worker_id) = i64::from_str_radix(worker_id_str, 16) else { - tracing::warn!("Could not parse worker ID from instance key: {}", key); + tracing::warn!("Could not parse worker ID from instance key: {key}"); continue; }; - tracing::info!("Generate endpoint instance deleted, removing worker {}", worker_id); + tracing::info!("Generate endpoint instance deleted, removing worker {worker_id}"); if let Err(e) = remove_worker_tx.send(worker_id).await { - tracing::warn!("Failed to send worker removal for worker {}: {}", worker_id, e); + tracing::warn!("Failed to send worker removal for worker {worker_id}: {e}"); } } @@ -381,24 +371,17 @@ pub async fn start_kv_router_background( continue; } - tracing::info!("Stream has {message_count} messages, attempting to acquire lock for purge and snapshot"); + tracing::info!("Stream has {message_count} messages, attempting to acquire write lock for purge and snapshot"); - // Try to acquire distributed lock - let Some(lock_response) = resources.lock(&etcd_client).await else { 
- continue; - }; - - // Perform snapshot upload and purge + // Perform snapshot upload and purge (acquires write lock internally) match resources.purge_then_snapshot( + &etcd_client, &mut nats_queue, &remove_worker_tx, ).await { Ok(_) => tracing::info!("Successfully performed purge and snapshot"), - Err(e) => tracing::error!("Failed to perform purge and snapshot: {e:?}"), + Err(e) => tracing::debug!("Could not perform purge and snapshot: {e:?}"), } - - // Release the lock - resources.unlock(&etcd_client, lock_response).await; } // Handle router deletion events @@ -409,7 +392,7 @@ pub async fn start_kv_router_background( }; let key = String::from_utf8_lossy(kv.key()); - tracing::info!("Detected router replica deletion: {}", key); + tracing::info!("Detected router replica deletion: {key}"); // Only process deletions for routers on the same component if !key.contains(component.path().as_str()) { @@ -422,44 +405,37 @@ pub async fn start_kv_router_background( // Extract the router UUID from the key let Some(router_uuid) = key.split('/').next_back() else { - tracing::warn!("Could not extract UUID from router key: {}", key); + tracing::warn!("Could not extract UUID from router key: {key}"); continue; }; // The consumer UUID is the router UUID let consumer_to_delete = router_uuid.to_string(); - tracing::info!("Attempting to delete orphaned consumer: {}", consumer_to_delete); - - // Try to acquire cleanup lock before deleting consumer - match etcd_client - .lock(cleanup_lock_name.clone(), Some(etcd_client.lease_id())) - .await - { - Ok(lock_response) => { - tracing::debug!( - "Acquired cleanup lock for deleting consumer: {}", - consumer_to_delete - ); + tracing::info!("Attempting to delete orphaned consumer: {consumer_to_delete}"); - // Delete the consumer - if let Err(e) = nats_queue.shutdown(Some(consumer_to_delete.clone())).await { - tracing::warn!("Failed to delete consumer {}: {}", consumer_to_delete, e); - } else { - tracing::info!("Successfully deleted orphaned 
consumer: {}", consumer_to_delete); - } + // Create a unique cleanup lock for this specific consumer + let cleanup_lock_name = format!("{}/{}/{}", ROUTER_CLEANUP_LOCK, component.subject(), consumer_to_delete); + let cleanup_rwlock = DistributedRWLock::new(cleanup_lock_name); - // Release the lock - if let Err(e) = etcd_client.unlock(lock_response.key()).await { - tracing::warn!("Failed to release cleanup lock: {e:?}"); - } - } - Err(e) => { - tracing::debug!( - "Could not acquire cleanup lock for consumer {}: {e:?}", - consumer_to_delete - ); + // Try to acquire cleanup write lock (non-blocking) before deleting consumer + if let Some(_cleanup_guard) = cleanup_rwlock.try_write_lock(&etcd_client).await { + tracing::debug!( + "Acquired cleanup lock for deleting consumer: {consumer_to_delete}" + ); + + // Delete the consumer + if let Err(e) = nats_queue.shutdown(Some(consumer_to_delete.clone())).await { + tracing::warn!("Failed to delete consumer {consumer_to_delete}: {e}"); + } else { + tracing::info!("Successfully deleted orphaned consumer: {consumer_to_delete}"); } + + // Cleanup lock is automatically released when _cleanup_guard goes out of scope + } else { + tracing::debug!( + "Could not acquire cleanup lock for consumer {consumer_to_delete}" + ); } } } @@ -506,7 +482,7 @@ async fn cleanup_orphaned_consumers( continue; } if !active_uuids.contains(&consumer) { - tracing::info!("Cleaning up orphaned consumer: {}", consumer); + tracing::info!("Cleaning up orphaned consumer: {consumer}"); let _ = nats_queue.shutdown(Some(consumer)).await; } } diff --git a/lib/runtime/src/transports/etcd.rs b/lib/runtime/src/transports/etcd.rs index a8da7703b07a..6fb17c07249a 100644 --- a/lib/runtime/src/transports/etcd.rs +++ b/lib/runtime/src/transports/etcd.rs @@ -21,9 +21,11 @@ pub use etcd_client::{ConnectOptions, KeyValue, LeaseClient}; use tokio::time::{Duration, interval}; mod lease; +mod lock; mod path; use lease::*; +pub use lock::*; pub use path::*; use 
super::utils::build_in_runtime; diff --git a/lib/runtime/src/transports/etcd/lock.rs b/lib/runtime/src/transports/etcd/lock.rs new file mode 100644 index 000000000000..e017c35cabd2 --- /dev/null +++ b/lib/runtime/src/transports/etcd/lock.rs @@ -0,0 +1,396 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Distributed read-write lock implementation using etcd atomic transactions + +use std::time::Duration; + +use etcd_client::{Compare, CompareOp, PutOptions, Txn, TxnOp}; + +use crate::Result; + +use super::Client; + +/// Timeout for acquiring read lock when downloading snapshots +const DEFAULT_READ_LOCK_TIMEOUT_SECS: u64 = 30; + +/// Distributed read-write lock for coordinating operations across multiple processes +/// +/// This implementation uses etcd atomic transactions to prevent race conditions: +/// - Write locks are exclusive (no readers or writers can coexist) +/// - Read locks are shared (multiple readers allowed, but no writers) +/// - All lock operations use atomic compare-and-set to ensure correctness +/// - Locks are bound to leases for automatic cleanup on client failure +#[derive(Clone)] +pub struct DistributedRWLock { + lock_prefix: String, +} + +pub struct WriteLockGuard<'a> { + rwlock: &'a DistributedRWLock, + etcd_client: &'a Client, +} + +impl Drop for WriteLockGuard<'_> { + fn drop(&mut self) { + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + let rwlock = self.rwlock.clone(); + let etcd_client = self.etcd_client.clone(); + handle.spawn(async move { + let write_key = format!("v1/{}/writer", rwlock.lock_prefix); + if let Err(e) = etcd_client.kv_delete(write_key.as_str(), None).await { + tracing::warn!("Failed to release write lock in drop: {e:?}"); + } + }); + } + Err(_) => { + tracing::error!( + "WriteLockGuard dropped outside tokio runtime - lock not released! \ + Lock will be cleaned up when etcd lease expires." 
+ ); + } + } + } +} + +pub struct ReadLockGuard<'a> { + rwlock: &'a DistributedRWLock, + etcd_client: &'a Client, + reader_id: String, +} + +impl Drop for ReadLockGuard<'_> { + fn drop(&mut self) { + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + let rwlock = self.rwlock.clone(); + let etcd_client = self.etcd_client.clone(); + let reader_id = self.reader_id.clone(); + handle.spawn(async move { + let reader_key = format!("v1/{}/readers/{reader_id}", rwlock.lock_prefix); + if let Err(e) = etcd_client.kv_delete(reader_key.as_str(), None).await { + tracing::warn!("Failed to release read lock in drop: {e:?}"); + } + }); + } + Err(_) => { + tracing::error!( + "ReadLockGuard dropped outside tokio runtime - lock not released! \ + Lock will be cleaned up when etcd lease expires." + ); + } + } + } +} + +impl DistributedRWLock { + /// Create a new distributed RWLock with the given prefix + /// + /// The lock will create keys under: + /// - `v1/{prefix}/writer` for the write lock + /// - `v1/{prefix}/readers/{reader_id}` for read locks + pub fn new(lock_prefix: String) -> Self { + Self { lock_prefix } + } + + /// Try to acquire exclusive write lock (non-blocking) + /// + /// Returns `Some(WriteLockGuard)` if acquired, `None` if readers exist or lock unavailable. + /// The guard automatically releases the lock when dropped. + /// + /// Implementation strategy: + /// 1. Atomically create writer key if it doesn't exist + /// 2. Immediately check if any readers exist + /// 3. If readers found, rollback (delete writer key) and return None + /// + /// Note: There is still a small race window (sub-millisecond) where a reader could acquire + /// a lock between steps 2-3. 
+ pub async fn try_write_lock<'a>( + &'a self, + etcd_client: &'a Client, + ) -> Option> { + let write_key = format!("v1/{}/writer", self.lock_prefix); + let lease_id = etcd_client.lease_id(); + let put_options = PutOptions::new().with_lease(lease_id); + + // Step 1: Atomically create write lock only if it doesn't exist + let txn = Txn::new() + .when(vec![Compare::version( + write_key.as_str(), + CompareOp::Equal, + 0, + )]) + .and_then(vec![TxnOp::put( + write_key.as_str(), + b"writing", + Some(put_options), + )]); + + // Execute the atomic transaction + match etcd_client.etcd_client().kv_client().txn(txn).await { + Ok(response) if response.succeeded() => { + // Step 2: Immediately check if any readers exist + let reader_prefix = format!("v1/{}/readers/", self.lock_prefix); + match etcd_client.kv_get_prefix(&reader_prefix).await { + Ok(readers) if !readers.is_empty() => { + // Readers exist! Rollback - delete our writer key + tracing::debug!( + "Found {} reader(s) after acquiring write lock, rolling back", + readers.len() + ); + if let Err(e) = etcd_client.kv_delete(write_key.as_str(), None).await { + tracing::warn!("Failed to rollback write lock: {e:?}"); + } + None + } + Ok(_) => { + // No readers, we successfully hold the write lock + tracing::debug!("Successfully acquired write lock with no readers"); + Some(WriteLockGuard { + rwlock: self, + etcd_client, + }) + } + Err(e) => { + // Error checking for readers - rollback to be safe + tracing::warn!( + "Failed to check for readers, rolling back write lock: {e:?}" + ); + let _ = etcd_client.kv_delete(write_key.as_str(), None).await; + None + } + } + } + Ok(_) => { + tracing::debug!("Write lock already exists, transaction failed"); + None + } + Err(e) => { + tracing::warn!("Failed to execute write lock transaction: {e:?}"); + None + } + } + } + + /// Acquire shared read lock with polling retry + /// + /// Polls every 100ms until write lock is released, then atomically acquires read lock. 
/// The guard automatically releases the lock when dropped. + /// Uses atomic transaction to prevent race with writer - the check for no write lock + /// and creation of read lock happen in a single atomic operation. + /// + /// # Arguments + /// * `etcd_client` - The etcd client + /// * `reader_id` - Unique identifier for this reader + /// * `timeout` - Optional timeout, defaults to 30 seconds (`DEFAULT_READ_LOCK_TIMEOUT_SECS`) + pub async fn read_lock_with_wait<'a>( + &'a self, + etcd_client: &'a Client, + reader_id: &str, + timeout: Option, + ) -> Result> { + let timeout = timeout.unwrap_or(Duration::from_secs(DEFAULT_READ_LOCK_TIMEOUT_SECS)); + let write_key = format!("v1/{}/writer", self.lock_prefix); + let reader_key = format!("v1/{}/readers/{reader_id}", self.lock_prefix); + let deadline = tokio::time::Instant::now() + timeout; + let lease_id = etcd_client.lease_id(); + + loop { + // Check if timeout exceeded + if tokio::time::Instant::now() > deadline { + anyhow::bail!("Timeout waiting for read lock after {:?}", timeout); + } + + // Try to atomically acquire read lock + // The transaction checks that no writer exists and creates reader key atomically + let put_options = PutOptions::new().with_lease(lease_id); + + // Build atomic transaction: create reader key only if write_key doesn't exist + let txn = Txn::new() + .when(vec![Compare::version( + write_key.as_str(), + CompareOp::Equal, + 0, + )]) + .and_then(vec![TxnOp::put( + reader_key.as_str(), + b"reading", + Some(put_options), + )]); + + // Execute the atomic transaction + match etcd_client.etcd_client().kv_client().txn(txn).await { + Ok(response) if response.succeeded() => { + tracing::debug!("Acquired read lock for reader {}", reader_id); + return Ok(ReadLockGuard { + rwlock: self, + etcd_client, + reader_id: reader_id.to_string(), + }); + } + Ok(_) => { + tracing::trace!("Write lock exists or was created, retrying after delay"); + } + Err(e) => { + tracing::warn!("Failed to execute read lock transaction: {e:?}"); + } + } + + // Wait
before next retry + tokio::time::sleep(Duration::from_millis(100)).await; + } + } +} + +#[cfg(feature = "testing-etcd")] +#[cfg(test)] +mod tests { + use super::*; + use crate::Runtime; + use std::sync::Arc; + use tokio::sync::Barrier; + + /// Test the DistributedRWLock behavior + /// + /// This test verifies: + /// 1. Multiple readers can acquire read locks simultaneously + /// 2. Write lock fails when readers are active + /// 3. Write lock succeeds when no locks are held + /// 4. Read lock waits for write lock to be released + #[tokio::test] + async fn test_distributed_rwlock() { + // Setup: Create etcd client + let runtime = Runtime::from_settings().unwrap(); + let etcd_client = Client::builder() + .etcd_url(vec!["http://localhost:2379".to_string()]) + .build() + .unwrap(); + let etcd_client = Client::new(etcd_client, runtime).await.unwrap(); + + // Prevent runtime from being dropped in async context at end of test + let etcd_client = std::mem::ManuallyDrop::new(etcd_client); + + // Create RWLock with unique prefix for this test + let test_id = uuid::Uuid::new_v4(); + let lock_prefix = format!("/test/rwlock/{}", test_id); + let rwlock = DistributedRWLock::new(lock_prefix.clone()); + + // Step 1: Acquire first read lock + let _reader1_guard = rwlock + .read_lock_with_wait(&etcd_client, "reader1", Some(Duration::from_secs(5))) + .await + .expect("First read lock should succeed"); + println!("✓ Acquired first read lock"); + + // Step 2: Acquire second read lock (should succeed - multiple readers allowed) + let _reader2_guard = rwlock + .read_lock_with_wait(&etcd_client, "reader2", Some(Duration::from_secs(5))) + .await + .expect("Second read lock should succeed"); + println!("✓ Acquired second read lock"); + + // Step 3: Try to acquire write lock (should fail - readers are active) + let write_result = rwlock.try_write_lock(&etcd_client).await; + assert!( + write_result.is_none(), + "Write lock should fail when readers are active" + ); + println!("✓ Write lock 
correctly failed with active readers"); + + // Step 4: Drop first read lock + drop(_reader1_guard); + tokio::time::sleep(Duration::from_millis(50)).await; // Give time for async drop + println!("✓ Released first read lock"); + + // Verify write lock still fails with one reader active + let write_result_with_one_reader = rwlock.try_write_lock(&etcd_client).await; + assert!( + write_result_with_one_reader.is_none(), + "Write lock should still fail when one reader is active" + ); + println!("✓ Write lock correctly failed with one reader still active"); + + drop(_reader2_guard); + tokio::time::sleep(Duration::from_millis(50)).await; // Give time for async drop + println!("✓ Released second read lock"); + + // Give etcd a moment to process the deletions + tokio::time::sleep(Duration::from_millis(100)).await; + + // Step 5: Acquire write lock (should succeed now - no locks held) + let _write_guard = rwlock + .try_write_lock(&etcd_client) + .await + .expect("Write lock should succeed with no readers"); + println!("✓ Acquired write lock"); + + // Step 5a: Try to acquire write lock again (should fail immediately - already held) + let write_result_already_held = rwlock.try_write_lock(&etcd_client).await; + assert!( + write_result_already_held.is_none(), + "Write lock should fail when another write lock is already held" + ); + println!("✓ Write lock correctly failed when already held"); + + // Step 6: Spawn background task to acquire read lock + // It should wait because write lock is held + let barrier = Arc::new(Barrier::new(2)); + let barrier_clone = barrier.clone(); + let rwlock_clone = rwlock.clone(); + let etcd_client_clone = etcd_client.clone(); + + let read_task = tokio::spawn(async move { + println!("→ Background: Attempting to acquire read lock (should wait)..."); + barrier_clone.wait().await; // Signal that we've started + + let start = std::time::Instant::now(); + let _guard = rwlock_clone + .read_lock_with_wait(&etcd_client_clone, "reader3", 
Some(Duration::from_secs(10))) + .await + .expect("Read lock should eventually succeed"); + + let elapsed = start.elapsed(); + println!("✓ Background: Acquired read lock after {:?}", elapsed); + + // Verify it actually waited (should be > 100ms since we sleep before releasing write lock) + assert!( + elapsed > Duration::from_millis(50), + "Read lock should have waited for write lock to be released" + ); + + // Guard will be dropped here, releasing the lock + }); + + // Wait for background task to start + barrier.wait().await; + + // Give the background task a moment to start polling + tokio::time::sleep(Duration::from_millis(200)).await; + + // Step 7: Release write lock by dropping guard + println!("→ Releasing write lock..."); + drop(_write_guard); + tokio::time::sleep(Duration::from_millis(50)).await; // Give time for async drop + println!("✓ Released write lock"); + + // Step 8: Background task should now succeed + read_task + .await + .expect("Background task should complete successfully"); + + // Final cleanup: verify all locks are released + tokio::time::sleep(Duration::from_millis(100)).await; + let remaining_locks = etcd_client + .kv_get_prefix(&format!("v1/{lock_prefix}")) + .await + .expect("Should be able to check remaining locks"); + assert!( + remaining_locks.is_empty(), + "All locks should be released at end of test" + ); + println!("✓ All locks cleaned up successfully"); + + println!("\n🎉 All DistributedRWLock tests passed!"); + } +} From c77b5dda0060513c6fb3e4137115d6a814e4f57e Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 17 Oct 2025 13:01:31 -0700 Subject: [PATCH 18/26] chore: Fix cuda lock in trtllm dockerfile (#3684) (#3704) Signed-off-by: Indrajit Bhosale --- container/Dockerfile.trtllm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index acd10a36214e..0a7571f1a67a 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -196,6 +196,9 @@ ARG 
TENSORRTLLM_INDEX_URL COPY --from=trtllm_wheel /*.whl /trtllm_wheel/ COPY --from=trtllm_wheel /*.txt /trtllm_wheel/ +# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6. +RUN uv pip install "cuda-python>=12,<13" + # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # because there might be mismatched versions of TensorRT between the NGC PyTorch # and the TRTLLM wheel. From b2053cc2ebf825f1e78714172d86d95a90237628 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Fri, 17 Oct 2025 13:02:45 -0700 Subject: [PATCH 19/26] docs: add gpu details for model recipes #3594 (#3707) --- recipes/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/recipes/README.md b/recipes/README.md index 81125d07c61a..636df5484bd8 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1,12 +1,12 @@ # Dynamo model serving recipes -| Model family | Backend | Mode | Deployment | Benchmark | -|---------------|---------|---------------------|------------|-----------| -| llama-3-70b | vllm | agg | ✓ | ✓ | -| llama-3-70b | vllm | disagg-multi-node | ✓ | ✓ | -| llama-3-70b | vllm | disagg-single-node | ✓ | ✓ | -| oss-gpt | trtllm | aggregated | ✓ | ✓ | -| DeepSeek-R1 | sglang | disaggregated | ✓ | 🚧 | +| Model family | Backend | Mode | GPU | Deployment | Benchmark | +|---------------|---------|---------------------|-------|------------|-----------| +| llama-3-70b | vllm | agg | H100, H200 | ✓ | ✓ | +| llama-3-70b | vllm | disagg-multi-node | H100, H200 | ✓ | ✓ | +| llama-3-70b | vllm | disagg-single-node | H100, H200 | ✓ | ✓ | +| DeepSeek-R1 | sglang | disaggregated | H200 | ✓ | 🚧 | +| oss-gpt | trtllm | aggregated | GB200 | ✓ | ✓ | ## Prerequisites From 7a22663d6f526c663300d8503a517467563d9a48 Mon Sep 17 00:00:00 2001 From: Neal Vaidya Date: Fri, 17 Oct 2025 13:04:29 -0700 Subject: [PATCH 20/26] docs: Adding elements required for version switcher (#3521) (#3711) Signed-off-by: Andrew Schilling 
Co-authored-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> --- docs/conf.py | 24 +++++++++++++++++++++- docs/project.json | 1 + docs/versions1.json | 50 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 docs/project.json create mode 100644 docs/versions1.json diff --git a/docs/conf.py b/docs/conf.py index 546b8c3ad069..d95a535b7ccc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,6 +9,7 @@ project = "NVIDIA Dynamo" copyright = "2024-2025, NVIDIA CORPORATION & AFFILIATES" author = "NVIDIA" +release = "latest" # -- General configuration --------------------------------------------------- @@ -58,9 +59,30 @@ # -- Options for HTML output ------------------------------------------------- html_theme = "nvidia_sphinx_theme" html_static_path = ["_static"] +html_extra_path = ["project.json", "versions1.json"] html_theme_options = { "collapse_navigation": False, - "github_url": "https://github.com/ai-dynamo/dynamo", + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/ai-dynamo/dynamo", + "icon": "fa-brands fa-github", + } + ], + "switcher": { + "json_url": "../versions1.json", + "version_match": release, + }, + "extra_head": { + """ + + """ + }, + "extra_footer": { + """ + + """ + }, "navbar_start": ["navbar-logo"], "primary_sidebar_end": [], } diff --git a/docs/project.json b/docs/project.json new file mode 100644 index 000000000000..3b94839f5f63 --- /dev/null +++ b/docs/project.json @@ -0,0 +1 @@ +{"name": "NVIDIA Dynamo", "version": "latest"} \ No newline at end of file diff --git a/docs/versions1.json b/docs/versions1.json new file mode 100644 index 000000000000..d931fc3ae60e --- /dev/null +++ b/docs/versions1.json @@ -0,0 +1,50 @@ +[ + { + "preferred": true, + "version": "latest", + "url": "https://docs.nvidia.com/dynamo/latest/" + }, + { + "version": "0.5.1", + "url": "https://docs.nvidia.com/dynamo/archive/0.5.1/" + }, + { + "version": "0.5.0", + "url": 
"https://docs.nvidia.com/dynamo/archive/0.5.0/" + }, + { + "name": "0.4.1", + "version": "0.4.1", + "url": "https://docs.nvidia.com/dynamo/archive/0.4.1/" + }, + { + "name": "0.4.0", + "version": "0.4.0", + "url": "https://docs.nvidia.com/dynamo/archive/0.4.0/" + }, + { + "name": "0.3.2", + "version": "0.3.2", + "url": "https://docs.nvidia.com/dynamo/archive/0.3.2/" + }, + { + "name": "0.3.1", + "version": "0.3.1", + "url": "https://docs.nvidia.com/dynamo/archive/0.3.1/" + }, + { + "name": "0.3.0", + "version": "0.3.0", + "url": "https://docs.nvidia.com/dynamo/archive/0.3.0/" + }, + { + "name": "0.2.1", + "version": "0.2.1", + "url": "https://docs.nvidia.com/dynamo/archive/0.2.1/" + }, + { + "name": "0.2.0", + "version": "0.2.0", + "url": "https://docs.nvidia.com/dynamo/archive/0.2.0/" + } +] \ No newline at end of file From e8531f5903400e2b6356bf7c5dd4bf84cabf1b80 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Fri, 17 Oct 2025 16:14:26 -0400 Subject: [PATCH 21/26] ci: OPS-980: Add operator build and push per-commit (#3620) (#3712) Signed-off-by: Dillon Cullinan Signed-off-by: Harrison Saturley-Hall Co-authored-by: Dillon Cullinan --- .../container-validation-backends.yml | 40 ++++++++++++++++++- deploy/cloud/operator/Dockerfile | 2 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index 1deef8e5ac1e..1a2a7de1335e 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -30,13 +30,51 @@ jobs: backend-status-check: runs-on: ubuntu-latest - needs: [vllm, sglang, trtllm] + needs: [vllm, sglang, trtllm, operator] if: always() steps: - name: "Check all dependent jobs" run: | echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. 
as $result | ["success", "skipped"] | any($result == .))' + operator: + needs: changed-files + if: needs.changed-files.outputs.has_code_changes == 'true' + strategy: + fail-fast: false + matrix: + platform: + - { arch: amd64, runner: cpu-amd-m5-2xlarge } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + name: operator (${{ matrix.platform.arch }}) + runs-on: ${{ matrix.platform.runner }} + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build Container + id: build-image + shell: bash + run: | + cd deploy/cloud/operator + docker buildx build --load \ + --platform linux/${{ matrix.platform.arch }} \ + -f Dockerfile \ + -t dynamo-operator:latest . + - name: Docker Tag and Push + uses: ./.github/actions/docker-tag-push + with: + local_image: dynamo-operator:latest + push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }} + aws_push: 'false' + azure_push: 'true' + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + vllm: needs: changed-files if: needs.changed-files.outputs.has_code_changes == 'true' diff --git a/deploy/cloud/operator/Dockerfile b/deploy/cloud/operator/Dockerfile index 406a0bff23d4..a666bb53e55a 100644 --- a/deploy/cloud/operator/Dockerfile +++ b/deploy/cloud/operator/Dockerfile @@ -6,7 +6,7 @@ FROM golang:1.24 AS base # Docker buildx automatically provides these ARG TARGETOS=linux -ARG TARGETARCH=amd64 +ARG TARGETARCH RUN echo "Building for ${TARGETOS}/${TARGETARCH}" From 7ae690fb31e7f3a6c21cdd095f2722e71a94d418 Mon Sep 17 00:00:00 2001 From: Hongkuan Zhou Date: Fri, 17 Oct 2025 13:15:09 -0700 Subject: [PATCH 22/26] fix: (cherry-pick) update k8s aic profile job arguments (#3699) 
(#3706) Signed-off-by: hongkuanz --- benchmarks/profiler/deploy/profile_sla_aic_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/profiler/deploy/profile_sla_aic_job.yaml b/benchmarks/profiler/deploy/profile_sla_aic_job.yaml index db085839ad20..10b56b309b66 100644 --- a/benchmarks/profiler/deploy/profile_sla_aic_job.yaml +++ b/benchmarks/profiler/deploy/profile_sla_aic_job.yaml @@ -53,7 +53,7 @@ spec: - h200_sxm - --aic-model-name - QWEN3_32B - - --backend-version + - --aic-backend-version - 0.20.0 volumeMounts: - name: output-volume From 34c42319b863f79be60782cf14ab1a17777c4f4c Mon Sep 17 00:00:00 2001 From: William Arnold <7565007+Aphoh@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:35:15 -0700 Subject: [PATCH 23/26] fix: cherry-pick to standardize planner units (#3713) Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com> --- benchmarks/profiler/profile_sla.py | 4 ++-- components/src/dynamo/planner/defaults.py | 4 ++-- .../planner/utils/perf_interpolation.py | 22 +++++++++---------- .../dynamo/planner/utils/planner_argparse.py | 7 ++++-- .../src/dynamo/planner/utils/planner_core.py | 21 ++++++++++++------ docs/benchmarks/pre_deployment_profiling.md | 8 +++---- docs/kubernetes/sla_planner_quickstart.md | 2 +- tests/planner/README.md | 20 ++++++++--------- .../perf_test_configs/disagg_8b_planner.yaml | 4 ++-- tests/planner/scaling/disagg_planner.yaml | 4 ++-- tests/planner/test_replica_calculation.py | 4 ++-- 11 files changed, 54 insertions(+), 46 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 757c073b2851..e9a3cfa40ba3 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -815,10 +815,10 @@ async def run_profile(args): "--osl", type=int, default=500, help="target output sequence length" ) parser.add_argument( - "--ttft", type=int, default=50, help="target Time To First Token in ms" + "--ttft", type=float, 
default=50.0, help="target Time To First Token in ms" ) parser.add_argument( - "--itl", type=int, default=10, help="target Inter Token Latency in ms" + "--itl", type=float, default=10.0, help="target Inter Token Latency in ms" ) # arguments used for interpolating TTFT and ITL under different ISL/OSL diff --git a/components/src/dynamo/planner/defaults.py b/components/src/dynamo/planner/defaults.py index 56c63fbbd079..e66337533f1f 100644 --- a/components/src/dynamo/planner/defaults.py +++ b/components/src/dynamo/planner/defaults.py @@ -89,8 +89,8 @@ class SLAPlannerDefaults(BasePlannerDefaults): profile_results_dir = "profiling_results" isl = 3000 # in number of tokens osl = 150 # in number of tokens - ttft = 0.5 # in seconds - itl = 0.05 # in seconds + ttft = 500.0 # in milliseconds + itl = 50.0 # in milliseconds load_predictor = "arima" # ["constant", "arima", "prophet"] load_prediction_window_size = 50 # predict load using how many recent load samples no_correction = False # disable correction factor, might be useful under some conditions like long cold start time diff --git a/components/src/dynamo/planner/utils/perf_interpolation.py b/components/src/dynamo/planner/utils/perf_interpolation.py index 8c5408214764..eccc079f2b72 100644 --- a/components/src/dynamo/planner/utils/perf_interpolation.py +++ b/components/src/dynamo/planner/utils/perf_interpolation.py @@ -51,9 +51,7 @@ def __init__( try: with np.load(prefill_npz_fn) as raw_data: self.prefill_isl = raw_data["prefill_isl"] - self.prefill_ttft = ( - raw_data["prefill_ttft"] / 1000 - ) # convert ms to s + self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"] except FileNotFoundError: logger.error( @@ -64,7 +62,7 @@ def __init__( elif raw_data: self.prefill_isl = raw_data["prefill_isl"] - self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s + self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds 
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"] else: raise ValueError("Either profile_results_dir or raw_data must be provided") @@ -150,7 +148,7 @@ def __init__( method="nearest", ) self.itl_interpolator[nan_mask] = itl_nearest[nan_mask] - self.itl_interpolator /= 1000 # convert ms to s + # ITL values are in milliseconds self.thpt_interpolator = scipy.interpolate.griddata( (self.x_kv_usage, self.y_context_length), @@ -230,12 +228,12 @@ def find_best_throughput_per_gpu( parser.add_argument("--profile_results_dir", type=str, required=True) parser.add_argument("--isl", type=int, default=3000) parser.add_argument("--osl", type=int, default=150) - parser.add_argument("--ttft", type=float, default=0.1, help="in s") - parser.add_argument("--itl", type=float, default=0.01, help="in s") + parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds") + parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds") args = parser.parse_args() print(f"ISL={args.isl}, OSL={args.osl}") - print(f"TTFT={args.ttft}s, ITL={args.itl}s") + print(f"TTFT={args.ttft}ms, ITL={args.itl}ms") print(f"Using profile results from {args.profile_results_dir}") print("") @@ -248,11 +246,11 @@ def find_best_throughput_per_gpu( if est_ttft <= args.ttft: print( - f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA." + f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA." ) else: print( - f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA." + f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA." 
) print( @@ -274,12 +272,12 @@ def find_best_throughput_per_gpu( ) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length) if est_itl <= args.itl: print( - f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage." + f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage." ) print( f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU." ) else: print( - f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA." + f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA." ) diff --git a/components/src/dynamo/planner/utils/planner_argparse.py b/components/src/dynamo/planner/utils/planner_argparse.py index 9832742dc990..4ecda6f07ab4 100644 --- a/components/src/dynamo/planner/utils/planner_argparse.py +++ b/components/src/dynamo/planner/utils/planner_argparse.py @@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser: "--ttft", type=float, default=SLAPlannerDefaults.ttft, - help="Time to first token", + help="Time to first token (float, in milliseconds)", ) parser.add_argument( - "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency" + "--itl", + type=float, + default=SLAPlannerDefaults.itl, + help="Inter-token latency (float, in milliseconds)", ) parser.add_argument( "--load-predictor", diff --git a/components/src/dynamo/planner/utils/planner_core.py b/components/src/dynamo/planner/utils/planner_core.py index 00521b01df07..a87251ba8389 100644 --- a/components/src/dynamo/planner/utils/planner_core.py +++ b/components/src/dynamo/planner/utils/planner_core.py @@ -249,13 +249,20 @@ async def observe_metrics(self): self.num_p_workers_gauge.set(len(self.p_endpoints)) self.num_d_workers_gauge.set(len(self.d_endpoints)) - self.last_metrics.ttft = 
self.prometheus_api_client.get_avg_time_to_first_token( - f"{self.args.adjustment_interval}s", - self.model_name, + # Prometheus returns seconds, convert to milliseconds + self.last_metrics.ttft = ( + self.prometheus_api_client.get_avg_time_to_first_token( + f"{self.args.adjustment_interval}s", + self.model_name, + ) + * 1000 ) - self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency( - f"{self.args.adjustment_interval}s", - self.model_name, + self.last_metrics.itl = ( + self.prometheus_api_client.get_avg_inter_token_latency( + f"{self.args.adjustment_interval}s", + self.model_name, + ) + * 1000 ) self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count( f"{self.args.adjustment_interval}s", @@ -284,7 +291,7 @@ async def observe_metrics(self): f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}" ) logger.info( - f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s" + f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms" ) self.num_req_predictor.add_data_point(self.last_metrics.num_req) diff --git a/docs/benchmarks/pre_deployment_profiling.md b/docs/benchmarks/pre_deployment_profiling.md index 6160fbb30df2..74ca4df2b39c 100644 --- a/docs/benchmarks/pre_deployment_profiling.md +++ b/docs/benchmarks/pre_deployment_profiling.md @@ -119,9 +119,9 @@ spec: - --osl - "150" # average OSL is 150 tokens - --ttft - - "200" # target TTFT is 200ms + - "200" # target TTFT is 200ms (float, in milliseconds) - --itl - - "20" # target ITL is 20ms + - "20" # target ITL is 20ms (float, in milliseconds) - --backend - ``` @@ -292,8 +292,8 @@ python3 -m benchmarks.profiler.profile_sla \ --aic-backend-version 0.20.0 \ --isl 3000 \ --osl 150 \ - --ttft 0.2 \ - --itl 0.02 + --ttft 200 \ + --itl 20 # target TTFT and ITL are floats, in milliseconds ``` The output will be written to `./profiling_results/`
and can be used directly with SLA planner deployment. diff --git a/docs/kubernetes/sla_planner_quickstart.md b/docs/kubernetes/sla_planner_quickstart.md index 8e65be24fa12..b5405523080d 100644 --- a/docs/kubernetes/sla_planner_quickstart.md +++ b/docs/kubernetes/sla_planner_quickstart.md @@ -203,7 +203,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10 ``` New adjustment interval started! Observed num_req: X.XXX isl: X.XXX osl: X.XXX -Observed ttft: X.XXXs itl: X.XXXs +Observed ttft: X.XXms itl: X.XXms Number of prefill workers: 1, number of decode workers: 1 ``` diff --git a/tests/planner/README.md b/tests/planner/README.md index 14a7112f715b..f8719cc928d1 100644 --- a/tests/planner/README.md +++ b/tests/planner/README.md @@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ --profile_results_dir \ --isl \ --osl \ - --ttft \ - --itl + --ttft \ + --itl ``` The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SLAs and advise the load that can saturate the engine. -For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200, +For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms): ```bash python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ --profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \ --isl 3000 \ --osl 300 \ - --ttft 0.2 \ - --itl 0.01 + --ttft 200 \ + --itl 10 # output: ISL=3000, OSL=300 -TTFT=0.1s, ITL=0.01s +TTFT=200ms, ITL=10ms Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/ Interpolating prefill performance ... - Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA. + Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA. Estimated throughput: 49481.09 tokens/s/gpu. 
Request rate at 16.49 requests/s will saturate one GPU. Interpolating decode performance ... Average context length: isl + osl/2 = 3150. - Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage. + Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage. Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU. ``` @@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge ```bash python components/planner/test/planner_sla_dryrun.py \ - --ttft 0.2 \ - --itl 0.01 \ + --ttft 200 \ + --itl 10 \ --adjustment-interval 60 \ --profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \ --dataset rr-5-45_i3000o300.jsonl \ diff --git a/tests/planner/perf_test_configs/disagg_8b_planner.yaml b/tests/planner/perf_test_configs/disagg_8b_planner.yaml index ddb052becc25..eb6dcc2e8ba8 100644 --- a/tests/planner/perf_test_configs/disagg_8b_planner.yaml +++ b/tests/planner/perf_test_configs/disagg_8b_planner.yaml @@ -87,8 +87,8 @@ spec: python3 -m planner_sla --environment=kubernetes --backend=vllm - --ttft 0.2 - --itl 0.01 + --ttft 200 + --itl 10 --profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/ --adjustment-interval=60 --prometheus-port=9085 diff --git a/tests/planner/scaling/disagg_planner.yaml b/tests/planner/scaling/disagg_planner.yaml index 53011ffe1d9a..354858d1877c 100644 --- a/tests/planner/scaling/disagg_planner.yaml +++ b/tests/planner/scaling/disagg_planner.yaml @@ -57,8 +57,8 @@ spec: --adjustment-interval=60 --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D --prometheus-port=9085 - --ttft=0.1 - --itl=0.01 + --ttft=100 + --itl=10 --load-predictor=constant --no-correction VllmDecodeWorker: diff --git a/tests/planner/test_replica_calculation.py b/tests/planner/test_replica_calculation.py index b9effd2cb4fb..1f15f74d4e0f 100644 --- a/tests/planner/test_replica_calculation.py +++ 
b/tests/planner/test_replica_calculation.py @@ -49,8 +49,8 @@ def planner(): args.decode_engine_num_gpu = 1 args.min_endpoint = 1 args.max_gpu_budget = 10 - args.ttft = 80 # ms - args.itl = 10 # ms + args.ttft = 80.0 # ms + args.itl = 10.0 # ms args.backend = "vllm" args.no_operation = True # Don't actually scale args.no_correction = False # Allow correction factors From a52f59e92e05e321e1457fc0e864c95d7f43b173 Mon Sep 17 00:00:00 2001 From: atchernych Date: Fri, 17 Oct 2025 14:57:26 -0700 Subject: [PATCH 24/26] fix: rename folder (#3718) Signed-off-by: Anna Tchernych --- .../epp-patches/{epp-v0.5.1-2 => v0.5.1-2}/epp-v0.5.1-dyn2.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename deploy/inference-gateway/epp-patches/{epp-v0.5.1-2 => v0.5.1-2}/epp-v0.5.1-dyn2.patch (100%) diff --git a/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch b/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch similarity index 100% rename from deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch rename to deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch From 4ebc72d2225da4bce4f1cf0baedeb5e38fc39f92 Mon Sep 17 00:00:00 2001 From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:59:21 -0700 Subject: [PATCH 25/26] fix: remove invalid aiperf args (#3710) (#3719) Signed-off-by: Hannah Zhang --- benchmarks/utils/aiperf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchmarks/utils/aiperf.py b/benchmarks/utils/aiperf.py index 3e46f6e194ce..2d357ea290f5 100644 --- a/benchmarks/utils/aiperf.py +++ b/benchmarks/utils/aiperf.py @@ -71,9 +71,6 @@ def run_aiperf( model_name, "--artifact-dir", str(output_dir), - "--", - "-vv", - "--max-threads=300", ] print( f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}", From f4864e67362ba53b14b5b8b7a9962320bc641e07 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Mon, 20 Oct 2025 14:22:33 -0700 Subject: [PATCH 
26/26] fix ModelCard code Signed-off-by: Anna Tchernych --- lib/bindings/c/src/lib.rs | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/lib/bindings/c/src/lib.rs b/lib/bindings/c/src/lib.rs index 7f7892e9cd8a..2e3471f9d1e2 100644 --- a/lib/bindings/c/src/lib.rs +++ b/lib/bindings/c/src/lib.rs @@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicU32, Ordering}; use dynamo_llm::kv_router::{ indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher, }; -use dynamo_runtime::{DistributedRuntime, Worker, storage::key_value_store::Key}; +use dynamo_runtime::{DistributedRuntime, Worker}; static WK: OnceCell = OnceCell::new(); static DRT: AsyncOnceCell = AsyncOnceCell::new(); // [FIXME] shouldn't the publisher be instance passing between API calls? @@ -332,9 +332,7 @@ use std::{pin::Pin, sync::Arc}; const GENERATE_ENDPOINT: &str = "generate"; use anyhow::Context; -use dynamo_runtime::{ - Runtime, distributed::DistributedConfig, slug::Slug, traits::DistributedRuntimeProvider, -}; +use dynamo_runtime::{Runtime, distributed::DistributedConfig, traits::DistributedRuntimeProvider}; use dynamo_llm::discovery::ModelManager; use dynamo_llm::entrypoint::build_routed_pipeline; @@ -971,22 +969,26 @@ pub async fn create_worker_selection_pipeline_chat( .component(component_name)?; let client = component.endpoint(GENERATE_ENDPOINT).client().await?; - let model_slug = Slug::from_string(model_name); - let card = match ModelDeploymentCard::load_from_store( - &Key::from_raw(model_slug.to_string()), - component.drt(), - ) - .await - { - Ok(Some(card)) => card, - Ok(None) => anyhow::bail!("ModelDeploymentCard not found for model: {}", model_name), - Err(err) => anyhow::bail!( - "Error fetching ModelDeploymentCard from storage under key {model_slug}. 
{err}" - ), - }; + // Discover the model card by searching all instances with this model name + use dynamo_llm::discovery::ModelWatcher; + let model_manager = std::sync::Arc::new(ModelManager::new()); + let watcher = ModelWatcher::new( + component.drt().clone(), + model_manager.clone(), + router_mode, + kv_router_config.clone(), + busy_threshold, + ); + let cards = watcher + .cards_for_model(model_name, Some(namespace), false) + .await + .with_context(|| format!("Failed to discover model: {}", model_name))?; + + let card = cards.into_iter().next().ok_or_else(|| { + anyhow::anyhow!("ModelDeploymentCard not found for model: {}", model_name) + })?; let chooser = if router_mode == RouterMode::KV { - let model_manager = std::sync::Arc::new(ModelManager::new()); Some( model_manager .kv_chooser_for(