Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
211bbab
fix: circular rust dynamo-parsers, dynamo-llm dependency (#3607) (#3…
saturley-hall Oct 14, 2025
0b6ef5f
chore: update the relevant my-registry and my-tag (#3611)
saturley-hall Oct 14, 2025
a25268d
chore: typo and new commands (#3617) (#3625)
saturley-hall Oct 14, 2025
a61a800
feat: cherry pick PR#3306 benchmarks use aiperf (#3626)
saturley-hall Oct 14, 2025
c4b41fd
feat: add pre-deployment check for storageclass (#3573) (#3608)
biswapanda Oct 14, 2025
165276f
chore: update sglang container and version (#3647)
ishandhanani Oct 15, 2025
ec47178
fix: cherrypick cuda 129 (#3652)
alec-flowers Oct 15, 2025
1ef8cc1
fix: update model recipe for llama-3 70b to match with common recipe …
biswapanda Oct 16, 2025
bf73dde
fix: copy commit info in trtllm build (#3619) (#3670)
nv-anants Oct 16, 2025
048ebd8
fix: update invalid AIPerf scripts and parsing logic (#3681)
ajcasagrande Oct 16, 2025
cbe523f
fix: aiconfigurator breaking tests due to not being installed correct…
saturley-hall Oct 16, 2025
6ef659c
feat: Replace genai-perf with aiperf in components/backends (#3528) (…
saturley-hall Oct 16, 2025
b28b8bb
fix: Cherry-pick in last of aiperf replacements (#3683)
saturley-hall Oct 16, 2025
c55f34a
fix: Reduce memory usage to avoid vLLM dsr1 OOM (#3660) (#3661)
krishung5 Oct 16, 2025
b08e97b
fix: cherry pick sglang bump + fix k8s yamls (#3708)
ishandhanani Oct 17, 2025
249c21a
fix: json strings should remain intact through profiler arg processin…
hhzhang16 Oct 17, 2025
8bc9f2f
feat: (cherrypick) custom distributed rw lock for radix snapshotting …
PeaBrane Oct 17, 2025
c77b5dd
chore: Fix cuda lock in trtllm dockerfile (#3684) (#3704)
indrajit96 Oct 17, 2025
b2053cc
docs: add gpu details for model recipes #3594 (#3707)
biswapanda Oct 17, 2025
7a22663
docs: Adding elements required for version switcher (#3521) (#3711)
nealvaidya Oct 17, 2025
e8531f5
ci: OPS-980: Add operator build and push per-commit (#3620) (#3712)
saturley-hall Oct 17, 2025
7ae690f
fix: (cherry-pick) update k8s aic profile job arguments (#3699) (#3706)
tedzhouhk Oct 17, 2025
34c4231
fix: cherry-pick to standardize planner units (#3713)
Aphoh Oct 17, 2025
a52f59e
fix: rename folder (#3718)
atchernych Oct 17, 2025
4ebc72d
fix: remove invalid aiperf args (#3710) (#3719)
hhzhang16 Oct 17, 2025
f4864e6
fix ModelCard code
atchernych Oct 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion .github/actions/docker-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ inputs:
aws_secret_access_key:
description: 'AWS Secret Access Key'
required: false
base_image_tag:
description: 'Optional override for base image tag passed to build.sh'
required: false
runtime_image_tag:
description: 'Optional override for RUNTIME_IMAGE_TAG build-arg'
required: false
cuda_version:
description: 'Optional override for CUDA_VERSION build-arg'
required: false
torch_backend:
description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
required: false

outputs:
image_tag:
Expand Down Expand Up @@ -81,14 +93,29 @@ runs:
echo "BUILD_START_TIME=${BUILD_START_TIME}" >> $GITHUB_ENV

echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
# Collect optional overrides provided by the workflow.
# Each non-empty action input appends exactly one flag to EXTRA_ARGS; an empty
# input appends nothing, so no flag is passed for it.
# NOTE(review): the ${{ inputs.* }} expressions are substituted into the script
# text before bash runs, so a value containing quotes or shell metacharacters
# would be interpreted as shell code. Consider exposing these inputs through the
# step's env: block and referencing "$VAR" instead — confirm callers only pass
# trusted literals.
# NOTE(review): EXTRA_ARGS is a plain string that is expanded unquoted on the
# build.sh command line, so a value containing whitespace would be split into
# several words.
EXTRA_ARGS=""
# The base image tag is forwarded as a build.sh option rather than a --build-arg.
if [ -n "${{ inputs.base_image_tag }}" ]; then
EXTRA_ARGS+=" --base-image-tag ${{ inputs.base_image_tag }}"
fi
# The remaining overrides are forwarded as --build-arg NAME=value pairs.
if [ -n "${{ inputs.runtime_image_tag }}" ]; then
EXTRA_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=${{ inputs.runtime_image_tag }}"
fi
if [ -n "${{ inputs.cuda_version }}" ]; then
EXTRA_ARGS+=" --build-arg CUDA_VERSION=${{ inputs.cuda_version }}"
fi
if [ -n "${{ inputs.torch_backend }}" ]; then
EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
fi

./container/build.sh --tag "$IMAGE_TAG" \
--target ${{ inputs.target }} \
--vllm-max-jobs 10 \
--framework ${{ inputs.framework }} \
--platform ${{ inputs.platform }} \
--use-sccache \
--sccache-bucket "$SCCACHE_S3_BUCKET" \
--sccache-region "$AWS_DEFAULT_REGION"
--sccache-region "$AWS_DEFAULT_REGION" $EXTRA_ARGS

BUILD_END_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
echo "🕐 Build ended at: ${BUILD_END_TIME}"
Expand Down
46 changes: 44 additions & 2 deletions .github/workflows/container-validation-backends.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,51 @@ jobs:

backend-status-check:
runs-on: ubuntu-latest
needs: [vllm, sglang, trtllm]
needs: [vllm, sglang, trtllm, operator]
if: always()
steps:
- name: "Check all dependent jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

# Builds the operator image once per CPU architecture and publishes it under a
# per-commit, per-architecture tag.
operator:
  needs: changed-files
  # Only run when the changed-files job reports code changes.
  if: needs.changed-files.outputs.has_code_changes == 'true'
  strategy:
    # A failure on one architecture must not cancel the other.
    fail-fast: false
    matrix:
      platform:
        - arch: amd64
          runner: cpu-amd-m5-2xlarge
        - arch: arm64
          runner: cpu-arm-r8g-4xlarge
  name: operator (${{ matrix.platform.arch }})
  runs-on: ${{ matrix.platform.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    # --load places the built image in the local image store under the
    # dynamo-operator:latest tag, which the next step refers to as local_image.
    - name: Build Container
      id: build-image
      shell: bash
      run: |
        cd deploy/cloud/operator
        docker buildx build \
          --platform linux/${{ matrix.platform.arch }} \
          --file Dockerfile \
          --tag dynamo-operator:latest \
          --load \
          .

    # Push is enabled for Azure only; the AWS inputs are still supplied but
    # aws_push is 'false'.
    - name: Docker Tag and Push
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: dynamo-operator:latest
        push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
        aws_push: 'false'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

vllm:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
Expand All @@ -58,6 +96,10 @@ jobs:
framework: vllm
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
Expand Down Expand Up @@ -251,4 +293,4 @@ jobs:
CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
run: |
# Upload complete workflow metrics including container metrics
python3 .github/workflows/upload_complete_workflow_metrics.py
python3 .github/workflows/upload_complete_workflow_metrics.py
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions Earthfile
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ dynamo-build:

dynamo-base-docker:
ARG IMAGE=dynamo-base-docker
ARG DOCKER_SERVER=my-registry
ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
ARG IMAGE_TAG=latest

FROM ubuntu:24.04
Expand All @@ -159,7 +159,7 @@ dynamo-base-docker:
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

RUN uv pip install -r /tmp/requirements.txt
RUN UV_GIT_LFS=1 uv pip install -r /tmp/requirements.txt

# Copy and install wheels -- ai-dynamo-runtime first, then ai-dynamo
COPY +dynamo-build/ai_dynamo_runtime*.whl /tmp/wheels/
Expand All @@ -175,7 +175,7 @@ all-test:
BUILD ./deploy/cloud/operator+test

all-docker:
ARG DOCKER_SERVER=my-registry
ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
ARG IMAGE_TAG=latest
BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG

Expand All @@ -189,6 +189,6 @@ all:

# For testing
custom:
ARG DOCKER_SERVER=my-registry
ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
ARG IMAGE_TAG=latest
BUILD +all-test
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res

Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:

- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
- **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements

# Engines
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# Benchmarks

This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around AIPerf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.

## Quick Start

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/incluster/benchmark_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
containers:
- name: benchmark-runner
# TODO: update to latest public image in next release
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
7 changes: 3 additions & 4 deletions benchmarks/llm/perf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then
echo "--------------------------------"
fi

echo "Running genai-perf with:"
echo "Running aiperf with:"
echo "Model: $model"
echo "ISL: $isl"
echo "OSL: $osl"
Expand All @@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do

# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
# `ignore_eos` since they are not in the official OpenAI spec.
genai-perf profile \
aiperf profile \
--model ${model} \
--tokenizer ${model} \
--endpoint-type chat \
Expand All @@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
--num-dataset-entries $(($concurrency*12)) \
--random-seed 100 \
--artifact-dir ${artifact_dir} \
-- \
--ui simple \
-v \
--max-threads ${concurrency} \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'

Expand Down
36 changes: 17 additions & 19 deletions benchmarks/llm/plot_pareto.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,21 @@


def get_json_paths(search_paths):
genai_perf_profile_export_json_paths = []
aiperf_profile_export_json_paths = []
deployment_config_json_paths = []
for search_path in search_paths:
deployment_config_json_path = os.path.join(
search_path, "deployment_config.json"
)
if not os.path.exists(deployment_config_json_path):
raise Exception(f"deployment_config.json not found in {search_path}")
for root, dirs, files in os.walk(search_path):
for root, _, files in os.walk(search_path):
for file in files:
if file == "profile_export_genai_perf.json":
genai_perf_profile_export_json_paths.append(
os.path.join(root, file)
)
if file == "profile_export_aiperf.json":
aiperf_profile_export_json_paths.append(os.path.join(root, file))
deployment_config_json_paths.append(deployment_config_json_path)

return genai_perf_profile_export_json_paths, deployment_config_json_paths
return aiperf_profile_export_json_paths, deployment_config_json_paths


# search for -concurrency<number> in the name
Expand Down Expand Up @@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path):


def extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
):
results = []
for genai_perf_profile_export_json_path, deployment_config_json_path in zip(
genai_perf_profile_export_json_paths, deployment_config_json_paths
for aiperf_profile_export_json_path, deployment_config_json_path in zip(
aiperf_profile_export_json_paths, deployment_config_json_paths
):
with open(genai_perf_profile_export_json_path, "r") as f:
with open(aiperf_profile_export_json_path, "r") as f:
data = json.load(f)
# output_token_throughput contains only avg
output_token_throughput = data.get("output_token_throughput", {}).get("avg")
Expand All @@ -99,7 +97,7 @@ def extract_val_and_concurrency(
# request_throughput contains only avg
request_throughput = data.get("request_throughput", {}).get("avg")

concurrency = parse_concurrency(genai_perf_profile_export_json_path)
concurrency = parse_concurrency(aiperf_profile_export_json_path)
num_gpus = parse_gpus(deployment_config_json_path)
kind, mode = parse_kind_and_mode(deployment_config_json_path)

Expand All @@ -116,7 +114,7 @@ def extract_val_and_concurrency(

results.append(
{
"configuration": genai_perf_profile_export_json_path,
"configuration": aiperf_profile_export_json_path,
"kind": kind,
"mode": mode,
"num_gpus": num_gpus,
Expand Down Expand Up @@ -241,12 +239,12 @@ def pareto_efficient(ids, points):
import os

parser = argparse.ArgumentParser(
description="Plot Pareto graph from GenAI-Perf artifacts"
description="Plot Pareto graph from AIPerf artifacts"
)
parser.add_argument(
"--artifacts-root-dir",
required=True,
help="Root directory containing artifact directories to search for profile_export_genai_perf.json files",
help="Root directory containing artifact directories to search for profile_export_aiperf.json files",
)
parser.add_argument(
"--title",
Expand All @@ -260,16 +258,16 @@ def pareto_efficient(ids, points):
if not artifacts_dirs:
raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")

genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
artifacts_dirs
)

if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths):
if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths):
raise ValueError(
f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
)

extracted_values = extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths
aiperf_profile_export_json_paths, deployment_config_json_paths
)
create_pareto_graph(extracted_values, title=args.title)
32 changes: 0 additions & 32 deletions benchmarks/nixl/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion benchmarks/profiler/deploy/profile_sla_aic_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ spec:
- h200_sxm
- --aic-model-name
- QWEN3_32B
- --backend-version
- --aic-backend-version
- 0.20.0
volumeMounts:
- name: output-volume
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/profiler/profile_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import logging
import os

from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
from benchmarks.profiler.utils.profile_prefill import profile_prefill

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand Down
Loading
Loading