From 15ac82a494dbcf2a058911e8bbf2aff2790a5173 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 24 Sep 2025 15:58:02 -0700
Subject: [PATCH 01/14] add model cache and use public aiperf image

Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
---
 .../gpt-oss-120b/model-cache/model-cache.yaml |  13 +++
 .../model-cache/model-download.yaml           |  47 ++++++++
 recipes/gpt-oss-120b/trtllm/agg/bench.yaml    |  85 ++++++++------
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml   | 108 ++++++++++++------
 4 files changed, 186 insertions(+), 67 deletions(-)
 create mode 100644 recipes/gpt-oss-120b/model-cache/model-cache.yaml
 create mode 100644 recipes/gpt-oss-120b/model-cache/model-download.yaml

diff --git a/recipes/gpt-oss-120b/model-cache/model-cache.yaml b/recipes/gpt-oss-120b/model-cache/model-cache.yaml
new file mode 100644
index 000000000000..790c46955e75
--- /dev/null
+++ b/recipes/gpt-oss-120b/model-cache/model-cache.yaml
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: model-cache
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 100Gi
+  storageClassName: "your-storage-class-name"
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml
new file mode 100644
index 000000000000..3471788195c7
--- /dev/null
+++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: model-download
+spec:
+  backoffLimit: 3
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      labels:
+        app: model-download
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: model-download
+          image: python:3.10-slim
+          command: ["sh", "-c"]
+          envFrom:
+            - secretRef:
+                name: hf-token-secret
+          env:
+            - name: MODEL_NAME
+              value: openai/gpt-oss-120b
+            - name: HF_HOME
+              value: /model-store
+            - name: HF_HUB_ENABLE_HF_TRANSFER
+              value: "1"
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: HF_TOKEN
+          args:
+            - |
+              set -eux
+              pip install --no-cache-dir huggingface_hub hf_transfer
+              hf download $MODEL_NAME --exclude "original/*" --exclude "metal/*"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /model-store
+      volumes:
+      - name: model-cache
+        persistentVolumeClaim:
+          claimName: model-cache
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
index 6d1a3c422c96..7eb9da4158fc 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
@@ -11,40 +11,27 @@ spec:
   template:
     metadata:
       labels:
-        app: oss-gpt120b
+        app: oss-gpt120b-bench
     spec:
-      restartPolicy: Never
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: nvidia.com/dynamo-graph-deployment-name
+                    operator: In
+                    values:
+                      - gpt-oss-agg
+              topologyKey: kubernetes.io/hostname
       containers:
-      - name: perf
-        image: my-registry/vllm-runtime:my-tag
-        workingDir: /workspace/components/backends/vllm
-        env:
-          - name: TARGET_MODEL
-            value: openai/gpt-oss-120b
-          - name: ENDPOINT
-            value: gpt-oss-agg-trtllmworker:8000
-          - name: CONCURRENCIES
-            value: "13000 13500 1400"
-          - name: ISL
-            value: "16"
-          - name: OSL
-            value: "1000"
-          - name: DEPLOYMENT_MODE
-            value: "agg"
-          - name: DEPLOYMENT_GPU_COUNT
-            value: "32"
-          - name: JOB_NAME
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.labels['job-name']
-          - name: ROOT_ARTIFACT_DIR
-            value: /root/.cache/huggingface/hub/perf
-        command:
+      - command:
         - /bin/sh
         - -c
         - |
           #TODO: this can be baked into the aiperf image
-          apt-get update && apt-get install -y curl jq
+          apt-get update && apt-get install -y curl jq procps
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
+          cat /proc/sys/net/ipv4/ip_local_port_range
           export COLUMNS=200
           EPOCH=$(date +%s)
           ## utility functions -- can be moved to a bash script / configmap
@@ -81,12 +68,12 @@ spec:
                 --extra-inputs "{\"ignore_eos\":true}" \
                 --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                 --concurrency $concurrency \
-                --request-count $((3*concurrency)) \
+                --request-count $((2*concurrency)) \
                 --warmup-request-count $concurrency \
                 --conversation-num 1 \
                 --random-seed 100 \
                 --request-rate 100000 \
-                --workers-max 128 \
+                --workers-max 252 \
                 -H 'Authorization: Bearer NOT USED' \
                 -H 'Accept: text/event-stream'\
                 --record-processors 32 \
@@ -113,11 +100,45 @@ spec:
             run_perf $concurrency $ISL $OSL
             sleep 10
           done
+        env:
+        - name: TARGET_MODEL
+          value: openai/gpt-oss-120b
+        - name: ENDPOINT
+          value: gpt-oss-agg-frontend:8000
+        - name: CONCURRENCIES
+          value: "130000"
+        - name: ISL
+          value: "16"
+        - name: OSL
+          value: "1000"
+        - name: DEPLOYMENT_MODE
+          value: agg
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "72"
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "252"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /root/.cache/huggingface/hub/perf
+        image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:aiperf-lim-2-31b447d6
+        imagePullPolicy: IfNotPresent
+        name: perf
+        resources: {}
+        securityContext:
+          privileged: true
         volumeMounts:
         - name: model-cache
-          mountPath: /root/.cache/huggingface
+          mountPath: /root/.cache/huggingface/hub
+        workingDir: /workspace/components/backends/vllm
+      dnsPolicy: ClusterFirst
       imagePullSecrets:
-        - name: nvcrimagepullsecret
+      - name: nvcrimagepullsecret
+      restartPolicy: Never
+      schedulerName: default-scheduler
       volumes:
       - name: model-cache
         persistentVolumeClaim:
diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index 53080c4288fa..6bfde411ee35 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -3,64 +3,102 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: gpt-oss-agg-shm
+  name: gpt-oss-agg
 spec:
   backendFramework: trtllm
   services:
+    Frontend:
+      componentType: frontend
+      dynamoNamespace: gpt-oss-agg
+      extraPodSpec:
+        affinity:
+          podAntiAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                - key: nvidia.com/dynamo-graph-deployment-name
+                  operator: In
+                  values:
+                  - gpt-oss-agg-frontend
+              topologyKey: kubernetes.io/hostname
+        mainContainer:
+          args:
+          - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
+          command:
+          - /bin/sh
+          - -c
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot
+      pvc:
+        create: false
+        mountPoint: /model-store
+        name: model-cache
+      replicas: 18
     TrtllmWorker:
       componentType: main
-      dynamoNamespace: gpt-oss-agg-shm
+      dynamoNamespace: gpt-oss-agg
       envFromSecret: hf-token-secret
-      pvc:
-        create: false
-        name: model-cache-oss-gpt120b
-        mountPoint: /root/.cache/huggingface
-      sharedMemory:
-        size: 80Gi
       extraPodSpec:
-        tolerations:
-          - key: "dedicated"
-            operator: "Equal"
-            value: "user-workload"
-            effect: "NoSchedule"
-          - key: "dedicated"
-            operator: "Equal"
-            value: "user-workload"
-            effect: "NoExecute"
         affinity:
           nodeAffinity:
             requiredDuringSchedulingIgnoredDuringExecution:
               nodeSelectorTerms:
-                - matchExpressions:
-                    - key: nvidia.com/gpu.present
-                      operator: In
-                      values:
-                        - "true"
+              - matchExpressions:
+                - key: nvidia.com/gpu.present
+                  operator: In
+                  values:
+                  - "true"
+          podAntiAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                - key: nvidia.com/dynamo-graph-deployment-name
+                  operator: In
+                  values:
+                  - gpt-oss-agg
+              topologyKey: kubernetes.io/hostname
         mainContainer:
           args:
           - |
-            export TRTLLM_ENABLE_PDL=1
-            export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True
-            export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"}
-            export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"}
-            export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
-            trap 'echo Cleaning up...; kill 0' EXIT
-            python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
             python3 -m dynamo.trtllm \
-              --model-path "$MODEL_PATH" \
-              --served-model-name "$SERVED_MODEL_NAME" \
-              --extra-engine-args "$ENGINE_ARGS" \
+              --model-path "/model-store/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" \
+              --served-model-name "openai/gpt-oss-120b" \
+              --extra-engine-args "${ENGINE_ARGS}" \
               --max-num-tokens 20000 \
               --max-batch-size 640 \
               --free-gpu-memory-fraction 0.9
           command:
           - /bin/sh
           - -c
-          image: my-registry/vllm-runtime:my-tag
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot
+          env:
+          - name: TRTLLM_ENABLE_PDL
+            value: "1"
+          - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
+            value: "True"
+          - name: SERVED_MODEL_NAME
+            value: "openai/gpt-oss-120b"
+          - name: ENGINE_ARGS
+            value: "/opt/dynamo/configs/config.yaml"
+          - name: MODEL_PATH
+            value: "/model-store/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
+          volumeMounts:
+          - mountPath: /opt/dynamo/configs
+            name: llm-config
+            readOnly: true
           workingDir: /workspace/components/backends/trtllm
-      replicas: 1
+        volumes:
+        - configMap:
+            name: llm-config
+          name: llm-config
+      pvc:
+        create: false
+        mountPoint: /model-store
+        name: model-cache
+      replicas: 18
       resources:
         limits:
           gpu: "4"
         requests:
-          gpu: "4"
\ No newline at end of file
+          gpu: "4"
+      sharedMemory:
+        size: 80Gi
\ No newline at end of file

From 61e60db2bec24b5cc17a399080f8c466ba298fc8 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 24 Sep 2025 17:06:50 -0700
Subject: [PATCH 02/14] fix: drop redundant HF_TOKEN env from model-download job

Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
---
 recipes/gpt-oss-120b/model-cache/model-download.yaml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml
index 3471788195c7..df898a792424 100644
--- a/recipes/gpt-oss-120b/model-cache/model-download.yaml
+++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml
@@ -28,11 +28,6 @@ spec:
               value: /model-store
             - name: HF_HUB_ENABLE_HF_TRANSFER
               value: "1"
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: HF_TOKEN
           args:
             - |
               set -eux

From cc8088bc0ffb03003c928af0258d4432f3e1419e Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 00:43:04 -0700
Subject: [PATCH 03/14] pin aiperf and dynamo trtllm runtime commits, add recipe README

---
 recipes/gpt-oss-120b/README.md               | 21 +++++++
 recipes/gpt-oss-120b/trtllm/agg/bench.yaml   | 59 +++++++++++---------
 recipes/gpt-oss-120b/trtllm/agg/config.yaml  | 16 ++----
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml  | 11 ++--
 recipes/gpt-oss-120b/trtllm/agg/service.yaml | 13 -----
 5 files changed, 64 insertions(+), 56 deletions(-)
 create mode 100644 recipes/gpt-oss-120b/README.md
 delete mode 100644 recipes/gpt-oss-120b/trtllm/agg/service.yaml

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
new file mode 100644
index 000000000000..06e448d332cb
--- /dev/null
+++ b/recipes/gpt-oss-120b/README.md
@@ -0,0 +1,21 @@
+Note:
+
+- This recipe is for gpt-oss-120b in aggregated mode.
+
+# Running the recipe
+```bash
+./run.sh --model gpt-oss-120b --framework trtllm agg
+```
+
+# Images
+
+This recipe uses the following container images using custom commits. You might need to build the images to reproduce the benchmark.
+
+* aiperf
+ Based on commit [70af59489df24a601dba57604a7341966150b366](https://github.com/ai-dynamo/aiperf/commit/70af59489df24a601dba57604a7341966150b366)
+
+* dynamo trtllm runtime for arm64
+based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f)
+```
+nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
+```
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
index 7eb9da4158fc..02843db6851f 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
@@ -28,8 +28,9 @@ spec:
         - /bin/sh
         - -c
         - |
-          #TODO: this can be baked into the aiperf image
-          apt-get update && apt-get install -y curl jq procps
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
+          pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
+          echo "aiperf installation completed";
           sysctl -w net.ipv4.ip_local_port_range="1024 65000"
           cat /proc/sys/net/ipv4/ip_local_port_range
           export COLUMNS=200
@@ -38,7 +39,7 @@ spec:
           wait_for_model_ready() {
             echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
             while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
-                echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..."
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                 sleep 5
             done
             echo "✅ Model '$TARGET_MODEL' is now available!"
@@ -52,11 +53,11 @@ spec:
             key=concurrency_${concurrency}
             export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
             mkdir -p "$ARTIFACT_DIR"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
-                --tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
-                --endpoint-type chat \
-                --endpoint /v1/chat/completions \
+                --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
+                --endpoint-type chat  --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \
@@ -67,12 +68,13 @@ spec:
                 --extra-inputs "{\"min_tokens\":$osl}" \
                 --extra-inputs "{\"ignore_eos\":true}" \
                 --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+                --extra-inputs "{\"repetition_penalty\":1.0}" \
+                --extra-inputs "{\"temperature\": 0.0}" \
                 --concurrency $concurrency \
-                --request-count $((2*concurrency)) \
+                --request-count $((10*concurrency)) \
                 --warmup-request-count $concurrency \
-                --conversation-num 1 \
+                --conversation-num 12800 \
                 --random-seed 100 \
-                --request-rate 100000 \
                 --workers-max 252 \
                 -H 'Authorization: Bearer NOT USED' \
                 -H 'Accept: text/event-stream'\
@@ -84,10 +86,15 @@ spec:
           #### Actual execution ####
           wait_for_model_ready
           mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
           # Write input_config.json
           cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
           {
             "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
             "mode": "$DEPLOYMENT_MODE",
             "isl": $ISL,
             "osl": $OSL,
@@ -95,26 +102,25 @@ spec:
             "model endpoint": "$TARGET_MODEL"
           }
           EOF
-          # Run perf for each concurrency
-          for concurrency in $CONCURRENCIES; do
-            run_perf $concurrency $ISL $OSL
-            sleep 10
-          done
+
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
         env:
         - name: TARGET_MODEL
           value: openai/gpt-oss-120b
         - name: ENDPOINT
           value: gpt-oss-agg-frontend:8000
-        - name: CONCURRENCIES
-          value: "130000"
+        - name: CONCURRENCY_PER_GPU
+          value: "900"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "72"
         - name: ISL
-          value: "16"
+          value: "128"
         - name: OSL
           value: "1000"
         - name: DEPLOYMENT_MODE
           value: agg
-        - name: DEPLOYMENT_GPU_COUNT
-          value: "72"
         - name: AIPERF_HTTP_CONNECTION_LIMIT
           value: "252"
         - name: JOB_NAME
@@ -123,22 +129,23 @@ spec:
               apiVersion: v1
               fieldPath: metadata.labels['job-name']
         - name: ROOT_ARTIFACT_DIR
-          value: /root/.cache/huggingface/hub/perf
-        image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:aiperf-lim-2-31b447d6
+          value: /model-cache/perf
+        - name: HF_HOME
+          value: /model-cache
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
         imagePullPolicy: IfNotPresent
         name: perf
-        resources: {}
         securityContext:
           privileged: true
         volumeMounts:
         - name: model-cache
-          mountPath: /root/.cache/huggingface/hub
-        workingDir: /workspace/components/backends/vllm
-      dnsPolicy: ClusterFirst
+          mountPath: /model-cache
+        workingDir: /workspace
       imagePullSecrets:
       - name: nvcrimagepullsecret
       restartPolicy: Never
-      schedulerName: default-scheduler
       volumes:
       - name: model-cache
         persistentVolumeClaim:
diff --git a/recipes/gpt-oss-120b/trtllm/agg/config.yaml b/recipes/gpt-oss-120b/trtllm/agg/config.yaml
index 5f50bb24b98f..2d1701bc3bdc 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/config.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/config.yaml
@@ -6,20 +6,12 @@ metadata:
   name: llm-config
 data:
   config.yaml: |
-    tensor_parallel_size: 4
-    moe_expert_parallel_size: 4
     enable_attention_dp: true
-    build_config:
-      max_batch_size: 640
-      max_num_tokens: 20000
-    moe_config:
-        backend: CUTLASS
     cuda_graph_config:
-        max_batch_size: 640
+        max_batch_size: 800
         enable_padding: true
     kv_cache_config:
-      free_gpu_memory_fraction: 0.9
       enable_block_reuse: false
-    print_iter_log: false
-    stream_interval: 50
-    use_torch_sampler: true
\ No newline at end of file
+    stream_interval: 20
+    moe_config:
+        backend: CUTLASS
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index 6bfde411ee35..9cfcabc4ab98 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -27,7 +27,7 @@ spec:
           command:
           - /bin/sh
           - -c
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
       pvc:
         create: false
         mountPoint: /model-store
@@ -60,16 +60,17 @@ spec:
           args:
           - |
             python3 -m dynamo.trtllm \
-              --model-path "/model-store/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" \
+              --model-path "${MODEL_PATH}" \
               --served-model-name "openai/gpt-oss-120b" \
               --extra-engine-args "${ENGINE_ARGS}" \
-              --max-num-tokens 20000 \
-              --max-batch-size 640 \
+              --tensor-parallel-size 4 \
+              --expert-parallel-size 4 \
+              --max-batch-size 800 \
               --free-gpu-memory-fraction 0.9
           command:
           - /bin/sh
           - -c
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
           env:
           - name: TRTLLM_ENABLE_PDL
             value: "1"
diff --git a/recipes/gpt-oss-120b/trtllm/agg/service.yaml b/recipes/gpt-oss-120b/trtllm/agg/service.yaml
deleted file mode 100644
index a1bee8e20e79..000000000000
--- a/recipes/gpt-oss-120b/trtllm/agg/service.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: v1
-kind: Service
-metadata:
-  name: gpt-oss-agg-trtllmworker
-spec:
-  selector:
-    nvidia.com/selector: gpt-oss-agg-trtllmworker
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000

From 27f8fd0fced077fa644f88bf2fc482dbb6a2fade Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 15:15:25 -0700
Subject: [PATCH 04/14] fix: remove antiaffinity

---
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index 9cfcabc4ab98..4020c6a71cf1 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -47,15 +47,6 @@ spec:
                   operator: In
                   values:
                   - "true"
-          podAntiAffinity:
-            requiredDuringSchedulingIgnoredDuringExecution:
-            - labelSelector:
-                matchExpressions:
-                - key: nvidia.com/dynamo-graph-deployment-name
-                  operator: In
-                  values:
-                  - gpt-oss-agg
-              topologyKey: kubernetes.io/hostname
         mainContainer:
           args:
           - |

From 5a42f4955eda57a17d1444f752d31690fd9b7e5b Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 17:07:10 -0700
Subject: [PATCH 05/14] fix: pin model revision in model-download job

---
 recipes/gpt-oss-120b/model-cache/model-download.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml
index df898a792424..f6554cf83886 100644
--- a/recipes/gpt-oss-120b/model-cache/model-download.yaml
+++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml
@@ -28,11 +28,13 @@ spec:
               value: /model-store
             - name: HF_HUB_ENABLE_HF_TRANSFER
               value: "1"
+            - name: MODEL_REVISION
+              value: b5c939de8f754692c1647ca79fbf85e8c1e70f8a
           args:
             - |
               set -eux
               pip install --no-cache-dir huggingface_hub hf_transfer
-              hf download $MODEL_NAME --exclude "original/*" --exclude "metal/*"
+              hf download $MODEL_NAME --revision $MODEL_REVISION --exclude "original/*" --exclude "metal/*"
           volumeMounts:
             - name: model-cache
               mountPath: /model-store

From ac596ab0dc55c253e933cff49010f52b7bffac22 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 17:21:48 -0700
Subject: [PATCH 06/14] docs: remove aiperf image note from gpt-oss-120b README

---
 recipes/gpt-oss-120b/README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index 06e448d332cb..0ce400bc8466 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -9,10 +9,8 @@ Note:
 
 # Images
 
-This recipe uses the following container images using custom commits. You might need to build the images to reproduce the benchmark.
-
-* aiperf
- Based on commit [70af59489df24a601dba57604a7341966150b366](https://github.com/ai-dynamo/aiperf/commit/70af59489df24a601dba57604a7341966150b366)
+This recipe uses the following trtllm container image based on pre release/0.5.1 commit.
+You might need to build the images to reproduce the benchmark.
 
 * dynamo trtllm runtime for arm64
 based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f)

From 0cbecad691feda21141a3ff79eb67471fee53094 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 17:42:24 -0700
Subject: [PATCH 07/14] docs: add image build steps to gpt-oss-120b README

---
 recipes/gpt-oss-120b/README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index 0ce400bc8466..01a009822d5d 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -13,7 +13,16 @@ This recipe uses the following trtllm container image based on pre release/0.5.1
 You might need to build the images to reproduce the benchmark.
 
 * dynamo trtllm runtime for arm64
-based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f)
+Below image is built based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f)
 ```
 nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
+```
+
+Steps to build the image:
+```bash
+git clone https://github.com/ai-dynamo/dynamo.git
+cd dynamo
+git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f
+
+./container/build.sh --framework TRTLLM --target runtime
 ```
\ No newline at end of file

From d67ef6efe2451cd2b10bfae97c7bf5e80151b90b Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 17:47:02 -0700
Subject: [PATCH 08/14] docs: restructure gpt-oss-120b README into a recipe guide

---
 recipes/gpt-oss-120b/README.md | 50 ++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index 01a009822d5d..e88ed0a640af 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -1,28 +1,48 @@
-Note:
+# GPT-OSS-120B Recipe Guide
 
-- This recipe is for gpt-oss-120b in aggregated mode.
+This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup.
+
+## Quick Start
+
+To run the model, simply execute this command in your terminal:
 
-# Running the recipe
 ```bash
 ./run.sh --model gpt-oss-120b --framework trtllm agg
 ```
 
-# Images
+## System Requirements
+
+### Model Download
 
-This recipe uses the following trtllm container image based on pre release/0.5.1 commit.
-You might need to build the images to reproduce the benchmark.
+### Container Image
+This recipe was tested with dynamo trtllm runtime container for ARM64 processors.
 
-* dynamo trtllm runtime for arm64
-Below image is built based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f)
+**Pre-built Image:**
 ```
 nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
 ```
 
-Steps to build the image:
-```bash
-git clone https://github.com/ai-dynamo/dynamo.git
-cd dynamo
-git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f
+### Building Your Own Image (Optional)
+
+If you need to build the container image yourself (for example, if you're using different hardware or want to customize the setup):
+
+1. **Clone the repository:**
+   ```bash
+   git clone https://github.com/ai-dynamo/dynamo.git
+   cd dynamo
+   ```
+
+2. **Switch to the specific version:**
+   ```bash
+   git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f
+   ```
+
+3. **Build the container:**
+   ```bash
+   ./container/build.sh --framework TRTLLM --target runtime
+   ```
+
+## Notes
+1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup.
 
-./container/build.sh --framework TRTLLM --target runtime
-```
\ No newline at end of file
+2. The storage class in this recipe is only a placeholder; set `storageClassName` in `model-cache/model-cache.yaml` to a storage class available in your cluster before applying it.
\ No newline at end of file

From 123a9fc25496a889e2c52364d91ff50aae5149f0 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 17:52:25 -0700
Subject: [PATCH 09/14] docs: add prerequisites and step-by-step guide to gpt-oss-120b README

---
 recipes/gpt-oss-120b/README.md | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index e88ed0a640af..b4351fe47f38 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -2,17 +2,34 @@
 
 This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup.
 
+## Prerequisites
+
+follow the instructions in recipe [README.md](../README.md) to create a namespace and kubernetes secret for huggingface token.
+
 ## Quick Start
 
 To run the model, simply execute this command in your terminal:
 
 ```bash
+cd recipes
 ./run.sh --model gpt-oss-120b --framework trtllm agg
 ```
 
-## System Requirements
+## (Alternative) Step by Step Guide
+
+### 1. Download the Model
 
-### Model Download
+```bash
+cd recipes/gpt-oss-120b
+kubectl apply -n $NAMESPACE -f ./model-cache
+```
+
+### 2. Deploy and Benchmark the Model
+
+```bash
+cd recipes/gpt-oss-120b
+kubectl apply -n $NAMESPACE -f ./trtllm/agg
+```
 
 ### Container Image
 This recipe was tested with dynamo trtllm runtime container for ARM64 processors.

From d05e7ca45de16d8f0110c983aa3165aede8fa458 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 18:05:46 -0700
Subject: [PATCH 10/14] fix

---
 recipes/gpt-oss-120b/README.md              | 23 +--------------------
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml |  4 ++--
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index b4351fe47f38..4e93dc5755e3 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -34,31 +34,10 @@ kubectl apply -n $NAMESPACE -f ./trtllm/agg
 ### Container Image
 This recipe was tested with dynamo trtllm runtime container for ARM64 processors.
 
-**Pre-built Image:**
 ```
-nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
+nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
 ```
 
-### Building Your Own Image (Optional)
-
-If you need to build the container image yourself (for example, if you're using different hardware or want to customize the setup):
-
-1. **Clone the repository:**
-   ```bash
-   git clone https://github.com/ai-dynamo/dynamo.git
-   cd dynamo
-   ```
-
-2. **Switch to the specific version:**
-   ```bash
-   git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f
-   ```
-
-3. **Build the container:**
-   ```bash
-   ./container/build.sh --framework TRTLLM --target runtime
-   ```
-
 ## Notes
 1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup.
 
diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index 4020c6a71cf1..c19a275eebf2 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -27,7 +27,7 @@ spec:
           command:
           - /bin/sh
           - -c
-          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
+          image: my-registry/trtllm-runtime:my-tag
       pvc:
         create: false
         mountPoint: /model-store
@@ -61,7 +61,7 @@ spec:
           command:
           - /bin/sh
           - -c
-          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64
+          image: my-registry/trtllm-runtime:my-tag
           env:
           - name: TRTLLM_ENABLE_PDL
             value: "1"

From 11554a9d3bc92c8c28f0b272cda87627895600fc Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 19:28:57 -0700
Subject: [PATCH 11/14] fix

---
 recipes/gpt-oss-120b/README.md              | 8 ++++++++
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index 4e93dc5755e3..5ba4b8f49859 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -34,10 +34,18 @@ kubectl apply -n $NAMESPACE -f ./trtllm/agg
 ### Container Image
 This recipe was tested with dynamo trtllm runtime container for ARM64 processors.
 
+**Important Note:**
+
+Before the Dynamo v0.5.1 release, the following container image is supported:
 ```
 nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
 ```
 
+After the Dynamo v0.5.1 release, the following container image will be supported:
+```
+nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
+```
+
 ## Notes
 1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup.
 
diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index c19a275eebf2..608725b1f526 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -27,7 +27,7 @@ spec:
           command:
           - /bin/sh
           - -c
-          image: my-registry/trtllm-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
       pvc:
         create: false
         mountPoint: /model-store
@@ -61,7 +61,7 @@ spec:
           command:
           - /bin/sh
           - -c
-          image: my-registry/trtllm-runtime:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
           env:
           - name: TRTLLM_ENABLE_PDL
             value: "1"

From c737358297f77edf1ff782c6164e1ff234e836e0 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 19:30:51 -0700
Subject: [PATCH 12/14] fix

---
 recipes/gpt-oss-120b/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
index 5ba4b8f49859..36ab361503fd 100644
--- a/recipes/gpt-oss-120b/README.md
+++ b/recipes/gpt-oss-120b/README.md
@@ -4,7 +4,7 @@ This guide will help you run the GPT-OSS-120B language model using Dynamo's opti
 
 ## Prerequisites
 
-follow the instructions in recipe [README.md](../README.md) to create a namespace and kubernetes secret for huggingface token.
+Follow the instructions in the recipes [README.md](../README.md) to create a namespace and a Kubernetes secret for your Hugging Face token.
 
 ## Quick Start
 

From 9e0e434de3a79cb7bed3d726c06ddbf4b4f184e1 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Mon, 29 Sep 2025 20:00:06 -0700
Subject: [PATCH 13/14] fix

---
 recipes/gpt-oss-120b/trtllm/agg/{bench.yaml => perf.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename recipes/gpt-oss-120b/trtllm/agg/{bench.yaml => perf.yaml} (100%)

diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml
similarity index 100%
rename from recipes/gpt-oss-120b/trtllm/agg/bench.yaml
rename to recipes/gpt-oss-120b/trtllm/agg/perf.yaml

From 4cfa0c59c760d528370195643db4cc71edc0d110 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Thu, 2 Oct 2025 12:51:02 -0700
Subject: [PATCH 14/14] update

---
 recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index fb5ba0bf2801..6f725af31088 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -99,6 +99,4 @@ spec:
         limits:
           gpu: "4"
         requests:
-          gpu: "4"
-      sharedMemory:
-        size: 80Gi
\ No newline at end of file
+          gpu: "4"
\ No newline at end of file