ai-dynamo · atchernych · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
@@ -34,6 +34,18 @@ inputs:
   aws_secret_access_key:
     description: 'AWS Secret Access Key'
     required: false
+  base_image_tag:
+    description: 'Optional override for base image tag passed to build.sh'
+    required: false
+  runtime_image_tag:
+    description: 'Optional override for RUNTIME_IMAGE_TAG build-arg'
+    required: false
+  cuda_version:
+    description: 'Optional override for CUDA_VERSION build-arg'
+    required: false
+  torch_backend:
+    description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
+    required: false
 
 outputs:
   image_tag:
@@ -81,14 +93,29 @@ runs:
         echo "BUILD_START_TIME=${BUILD_START_TIME}" >> $GITHUB_ENV
 
         echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
+        # Collect optional overrides in a bash array so every value reaches build.sh as exactly one argument (no word-splitting/globbing)
+        EXTRA_ARGS=()
+        if [ -n "${{ inputs.base_image_tag }}" ]; then
+          EXTRA_ARGS+=(--base-image-tag "${{ inputs.base_image_tag }}")
+        fi
+        if [ -n "${{ inputs.runtime_image_tag }}" ]; then
+          EXTRA_ARGS+=(--build-arg "RUNTIME_IMAGE_TAG=${{ inputs.runtime_image_tag }}")
+        fi
+        if [ -n "${{ inputs.cuda_version }}" ]; then
+          EXTRA_ARGS+=(--build-arg "CUDA_VERSION=${{ inputs.cuda_version }}")
+        fi
+        if [ -n "${{ inputs.torch_backend }}" ]; then
+          EXTRA_ARGS+=(--build-arg "TORCH_BACKEND=${{ inputs.torch_backend }}")
+        fi
+
         ./container/build.sh --tag "$IMAGE_TAG" \
           --target ${{ inputs.target }} \
           --vllm-max-jobs 10 \
           --framework ${{ inputs.framework }} \
           --platform ${{ inputs.platform }} \
           --use-sccache \
           --sccache-bucket "$SCCACHE_S3_BUCKET" \
-          --sccache-region "$AWS_DEFAULT_REGION"
+          --sccache-region "$AWS_DEFAULT_REGION" "${EXTRA_ARGS[@]}"
 
         BUILD_END_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
         echo "🕐 Build ended at: ${BUILD_END_TIME}"

@@ -30,13 +30,51 @@ jobs:
 
   backend-status-check:
     runs-on: ubuntu-latest
-    needs: [vllm, sglang, trtllm]
+    needs: [vllm, sglang, trtllm, operator]
     if: always()
     steps:
       - name: "Check all dependent jobs"
         run: |
           echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
 
+  operator:
+    needs: changed-files
+    if: needs.changed-files.outputs.has_code_changes == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - { arch: amd64, runner: cpu-amd-m5-2xlarge }
+          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
+    name: operator (${{ matrix.platform.arch }})
+    runs-on: ${{ matrix.platform.runner }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build Container
+        id: build-image
+        shell: bash
+        run: |
+          cd deploy/cloud/operator
+          docker buildx build --load \
+              --platform linux/${{ matrix.platform.arch }} \
+              -f Dockerfile \
+              -t dynamo-operator:latest .
+      - name: Docker Tag and Push
+        uses: ./.github/actions/docker-tag-push
+        with:
+          local_image: dynamo-operator:latest
+          push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
+          aws_push: 'false'
+          azure_push: 'true'
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+
   vllm:
     needs: changed-files
     if: needs.changed-files.outputs.has_code_changes == 'true'
@@ -58,6 +96,10 @@ jobs:
           framework: vllm
           target: runtime
           platform: 'linux/${{ matrix.platform.arch }}'
+          base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
+          runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
+          cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
+          torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
           ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
           ci_token: ${{ secrets.CI_TOKEN }}
           aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
@@ -251,4 +293,4 @@ jobs:
           CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
         run: |
           # Upload complete workflow metrics including container metrics
-          python3 .github/workflows/upload_complete_workflow_metrics.py
+          python3 .github/workflows/upload_complete_workflow_metrics.py
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Earthfile b/Earthfile
@@ -134,7 +134,7 @@ dynamo-build:
 
 dynamo-base-docker:
     ARG IMAGE=dynamo-base-docker
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
 
     FROM ubuntu:24.04
@@ -159,7 +159,7 @@ dynamo-base-docker:
     ENV VIRTUAL_ENV=/opt/dynamo/venv
     ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
 
-    RUN uv pip install -r /tmp/requirements.txt
+    RUN UV_GIT_LFS=1 uv pip install -r /tmp/requirements.txt
 
     # Copy and install wheels -- ai-dynamo-runtime first, then ai-dynamo
     COPY +dynamo-build/ai_dynamo_runtime*.whl /tmp/wheels/
@@ -175,7 +175,7 @@ all-test:
     BUILD ./deploy/cloud/operator+test
 
 all-docker:
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
     BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
 
@@ -189,6 +189,6 @@ all:
 
 # For testing
 custom:
-    ARG DOCKER_SERVER=my-registry
+    ARG DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo
     ARG IMAGE_TAG=latest
     BUILD +all-test
diff --git a/README.md b/README.md
@@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res
 
 Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:
 
-- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf
+- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
 - **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements
 
 # Engines

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -15,7 +15,7 @@
 
 # Benchmarks
 
-This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
+This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around aiperf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
 
 ## Quick Start
 

diff --git a/benchmarks/incluster/benchmark_job.yaml b/benchmarks/incluster/benchmark_job.yaml
@@ -18,7 +18,7 @@ spec:
       containers:
       - name: benchmark-runner
         # TODO: update to latest public image in next release
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:

diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh
@@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then
     echo "--------------------------------"
 fi
 
-echo "Running genai-perf with:"
+echo "Running aiperf with:"
 echo "Model: $model"
 echo "ISL: $isl"
 echo "OSL: $osl"
@@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do
 
   # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
   # `ignore_eos` since they are not in the official OpenAI spec.
-  genai-perf profile \
+  aiperf profile \
     --model ${model} \
     --tokenizer ${model} \
     --endpoint-type chat \
@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
     --num-dataset-entries $(($concurrency*12)) \
     --random-seed 100 \
     --artifact-dir ${artifact_dir} \
-    -- \
+    --ui simple \
     -v \
-    --max-threads ${concurrency} \
     -H 'Authorization: Bearer NOT USED' \
     -H 'Accept: text/event-stream'
 

@@ -26,23 +26,21 @@
 
 
 def get_json_paths(search_paths):
-    genai_perf_profile_export_json_paths = []
+    aiperf_profile_export_json_paths = []
     deployment_config_json_paths = []
     for search_path in search_paths:
         deployment_config_json_path = os.path.join(
             search_path, "deployment_config.json"
         )
         if not os.path.exists(deployment_config_json_path):
             raise Exception(f"deployment_config.json not found in {search_path}")
-        for root, dirs, files in os.walk(search_path):
+        for root, _, files in os.walk(search_path):
             for file in files:
-                if file == "profile_export_genai_perf.json":
-                    genai_perf_profile_export_json_paths.append(
-                        os.path.join(root, file)
-                    )
+                if file == "profile_export_aiperf.json":
+                    aiperf_profile_export_json_paths.append(os.path.join(root, file))
                     deployment_config_json_paths.append(deployment_config_json_path)
 
-    return genai_perf_profile_export_json_paths, deployment_config_json_paths
+    return aiperf_profile_export_json_paths, deployment_config_json_paths
 
 
 # search for -concurrency<number> in the name
@@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path):
 
 
 def extract_val_and_concurrency(
-    genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
+    aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
 ):
     results = []
-    for genai_perf_profile_export_json_path, deployment_config_json_path in zip(
-        genai_perf_profile_export_json_paths, deployment_config_json_paths
+    for aiperf_profile_export_json_path, deployment_config_json_path in zip(
+        aiperf_profile_export_json_paths, deployment_config_json_paths
     ):
-        with open(genai_perf_profile_export_json_path, "r") as f:
+        with open(aiperf_profile_export_json_path, "r") as f:
             data = json.load(f)
             # output_token_throughput contains only avg
             output_token_throughput = data.get("output_token_throughput", {}).get("avg")
@@ -99,7 +97,7 @@ def extract_val_and_concurrency(
             # request_throughput contains only avg
             request_throughput = data.get("request_throughput", {}).get("avg")
 
-        concurrency = parse_concurrency(genai_perf_profile_export_json_path)
+        concurrency = parse_concurrency(aiperf_profile_export_json_path)
         num_gpus = parse_gpus(deployment_config_json_path)
         kind, mode = parse_kind_and_mode(deployment_config_json_path)
 
@@ -116,7 +114,7 @@ def extract_val_and_concurrency(
 
         results.append(
             {
-                "configuration": genai_perf_profile_export_json_path,
+                "configuration": aiperf_profile_export_json_path,
                 "kind": kind,
                 "mode": mode,
                 "num_gpus": num_gpus,
@@ -241,12 +239,12 @@ def pareto_efficient(ids, points):
     import os
 
     parser = argparse.ArgumentParser(
-        description="Plot Pareto graph from GenAI-Perf artifacts"
+        description="Plot Pareto graph from AIPerf artifacts"
     )
     parser.add_argument(
         "--artifacts-root-dir",
         required=True,
-        help="Root directory containing artifact directories to search for profile_export_genai_perf.json files",
+        help="Root directory containing artifact directories to search for profile_export_aiperf.json files",
     )
     parser.add_argument(
         "--title",
@@ -260,16 +258,16 @@ def pareto_efficient(ids, points):
     if not artifacts_dirs:
         raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")
 
-    genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
+    aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
         artifacts_dirs
     )
 
-    if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths):
+    if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths):
         raise ValueError(
-            f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
+            f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
         )
 
     extracted_values = extract_val_and_concurrency(
-        genai_perf_profile_export_json_paths, deployment_config_json_paths
+        aiperf_profile_export_json_paths, deployment_config_json_paths
     )
     create_pareto_graph(extracted_values, title=args.title)
diff --git a/benchmarks/nixl/README.md b/benchmarks/nixl/README.md
diff --git a/benchmarks/profiler/deploy/profile_sla_aic_job.yaml b/benchmarks/profiler/deploy/profile_sla_aic_job.yaml
@@ -53,7 +53,7 @@ spec:
           - h200_sxm
           - --aic-model-name
           - QWEN3_32B
-          - --backend-version
+          - --aic-backend-version
           - 0.20.0
         volumeMounts:
           - name: output-volume

@@ -5,8 +5,8 @@
 import logging
 import os
 
-from utils.profile_decode import profile_decode
-from utils.profile_prefill import profile_prefill
+from benchmarks.profiler.utils.profile_decode import profile_decode
+from benchmarks.profiler.utils.profile_prefill import profile_prefill
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)