diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md
new file mode 100644
index 000000000000..36ab361503fd
--- /dev/null
+++ b/recipes/gpt-oss-120b/README.md
@@ -0,0 +1,52 @@
+# GPT-OSS-120B Recipe Guide
+
+This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup.
+
+## Prerequisites
+
+Follow the instructions in the recipes [README.md](../README.md) to create a namespace and a Kubernetes secret for your Hugging Face token.
+
+## Quick Start
+
+To run the model, simply execute this command in your terminal:
+
+```bash
+cd recipes
+./run.sh --model gpt-oss-120b --framework trtllm agg
+```
+
+## (Alternative) Step by Step Guide
+
+### 1. Download the Model
+
+```bash
+cd recipes/gpt-oss-120b
+kubectl apply -n $NAMESPACE -f ./model-cache
+```
+
+### 2. Deploy and Benchmark the Model
+
+```bash
+cd recipes/gpt-oss-120b
+kubectl apply -n $NAMESPACE -f ./trtllm/agg
+```
+
+### Container Image
+This recipe was tested with the Dynamo TensorRT-LLM runtime container for ARM64 processors.
+
+**Important Note:**
+
+Before the Dynamo v0.5.1 release, the following container image is supported:
+```
+nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
+```
+
+After the Dynamo v0.5.1 release, the following container image will be supported:
+```
+nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
+```
+
+## Notes
+1. The benchmark job installs aiperf from a pinned commit to ensure reproducible results and compatibility with the benchmarking setup.
+
+2. The storage class is not specified in the recipe; you need to set `storageClassName` in `model-cache/model-cache.yaml` before applying it.
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/model-cache/model-cache.yaml b/recipes/gpt-oss-120b/model-cache/model-cache.yaml
new file mode 100644
index 000000000000..790c46955e75
--- /dev/null
+++ b/recipes/gpt-oss-120b/model-cache/model-cache.yaml
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: model-cache
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 100Gi
+  storageClassName: "your-storage-class-name"
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml
new file mode 100644
index 000000000000..f6554cf83886
--- /dev/null
+++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: model-download
+spec:
+  backoffLimit: 3
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      labels:
+        app: model-download
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: model-download
+          image: python:3.10-slim
+          command: ["sh", "-c"]
+          envFrom:
+            - secretRef:
+                name: hf-token-secret
+          env:
+            - name: MODEL_NAME
+              value: openai/gpt-oss-120b
+            - name: HF_HOME
+              value: /model-store
+            - name: HF_HUB_ENABLE_HF_TRANSFER
+              value: "1"
+            - name: MODEL_REVISION
+              value: b5c939de8f754692c1647ca79fbf85e8c1e70f8a
+          args:
+            - |
+              set -eux
+              pip install --no-cache-dir huggingface_hub hf_transfer
+              hf download $MODEL_NAME --revision $MODEL_REVISION --exclude "original/*" --exclude "metal/*"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /model-store
+      volumes:
+      - name: model-cache
+        persistentVolumeClaim:
+          claimName: model-cache
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/trtllm/agg/config.yaml b/recipes/gpt-oss-120b/trtllm/agg/config.yaml
index 5f50bb24b98f..2d1701bc3bdc 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/config.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/config.yaml
@@ -6,20 +6,12 @@ metadata:
   name: llm-config
 data:
   config.yaml: |
-    tensor_parallel_size: 4
-    moe_expert_parallel_size: 4
     enable_attention_dp: true
-    build_config:
-      max_batch_size: 640
-      max_num_tokens: 20000
-    moe_config:
-        backend: CUTLASS
     cuda_graph_config:
-        max_batch_size: 640
+        max_batch_size: 800
         enable_padding: true
     kv_cache_config:
-      free_gpu_memory_fraction: 0.9
       enable_block_reuse: false
-    print_iter_log: false
-    stream_interval: 50
-    use_torch_sampler: true
\ No newline at end of file
+    stream_interval: 20
+    moe_config:
+        backend: CUTLASS
\ No newline at end of file
diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
index 7dd176f3ecb6..d3c8479f1b12 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -3,61 +3,91 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: gpt-oss-agg-shm
+  name: gpt-oss-agg
 spec:
   backendFramework: trtllm
   services:
+    Frontend:
+      componentType: frontend
+      dynamoNamespace: gpt-oss-agg
+      extraPodSpec:
+        affinity:
+          podAntiAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                - key: nvidia.com/dynamo-graph-deployment-name
+                  operator: In
+                  values:
+                  - gpt-oss-agg-frontend
+              topologyKey: kubernetes.io/hostname
+        mainContainer:
+          args:
+          - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
+          command:
+          - /bin/sh
+          - -c
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
+      pvc:
+        create: false
+        mountPoint: /model-store
+        name: model-cache
+      replicas: 1
     TrtllmWorker:
       componentType: main
-      dynamoNamespace: gpt-oss-agg-shm
+      dynamoNamespace: gpt-oss-agg
       envFromSecret: hf-token-secret
-      pvc:
-        create: false
-        name: model-cache-oss-gpt120b
-        mountPoint: /root/.cache/huggingface
       sharedMemory:
         size: 80Gi
       extraPodSpec:
-        tolerations:
-          - key: "dedicated"
-            operator: "Equal"
-            value: "user-workload"
-            effect: "NoSchedule"
-          - key: "dedicated"
-            operator: "Equal"
-            value: "user-workload"
-            effect: "NoExecute"
         affinity:
           nodeAffinity:
             requiredDuringSchedulingIgnoredDuringExecution:
               nodeSelectorTerms:
-                - matchExpressions:
-                    - key: nvidia.com/gpu.present
-                      operator: In
-                      values:
-                        - "true"
+              - matchExpressions:
+                - key: nvidia.com/gpu.present
+                  operator: In
+                  values:
+                  - "true"
         mainContainer:
           args:
           - |
-            export TRTLLM_ENABLE_PDL=1
-            export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True
-            export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"}
-            export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"}
-            export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
-            trap 'echo Cleaning up...; kill 0' EXIT
-            python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
             python3 -m dynamo.trtllm \
-              --model-path "$MODEL_PATH" \
-              --served-model-name "$SERVED_MODEL_NAME" \
-              --extra-engine-args "$ENGINE_ARGS" \
-              --max-num-tokens 20000 \
-              --max-batch-size 640 \
+              --model-path "${MODEL_PATH}" \
+              --served-model-name "${SERVED_MODEL_NAME}" \
+              --extra-engine-args "${ENGINE_ARGS}" \
+              --tensor-parallel-size 4 \
+              --expert-parallel-size 4 \
+              --max-batch-size 800 \
               --free-gpu-memory-fraction 0.9
           command:
           - /bin/sh
           - -c
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.1
+          env:
+          - name: TRTLLM_ENABLE_PDL
+            value: "1"
+          - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
+            value: "True"
+          - name: SERVED_MODEL_NAME
+            value: "openai/gpt-oss-120b"
+          - name: ENGINE_ARGS
+            value: "/opt/dynamo/configs/config.yaml"
+          - name: MODEL_PATH
+            value: "/model-store/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
+          volumeMounts:
+          - mountPath: /opt/dynamo/configs
+            name: llm-config
+            readOnly: true
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
           workingDir: /workspace/components/backends/trtllm
+        volumes:
+        - configMap:
+            name: llm-config
+          name: llm-config
+      pvc:
+        create: false
+        mountPoint: /model-store
+        name: model-cache
       replicas: 1
       resources:
         limits:
diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml
similarity index 51%
rename from recipes/gpt-oss-120b/trtllm/agg/bench.yaml
rename to recipes/gpt-oss-120b/trtllm/agg/perf.yaml
index 76a1fd45126e..eed5d69addbf 100644
--- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml
@@ -11,47 +11,35 @@ spec:
   template:
     metadata:
       labels:
-        app: oss-gpt120b
+        app: oss-gpt120b-bench
     spec:
-      restartPolicy: Never
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: nvidia.com/dynamo-graph-deployment-name
+                    operator: In
+                    values:
+                      - gpt-oss-agg
+              topologyKey: kubernetes.io/hostname
       containers:
-      - name: perf
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.1
-        workingDir: /workspace/components/backends/vllm
-        env:
-          - name: TARGET_MODEL
-            value: openai/gpt-oss-120b
-          - name: ENDPOINT
-            value: gpt-oss-agg-trtllmworker:8000
-          - name: CONCURRENCIES
-            value: "13000 13500 1400"
-          - name: ISL
-            value: "16"
-          - name: OSL
-            value: "1000"
-          - name: DEPLOYMENT_MODE
-            value: "agg"
-          - name: DEPLOYMENT_GPU_COUNT
-            value: "32"
-          - name: JOB_NAME
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.labels['job-name']
-          - name: ROOT_ARTIFACT_DIR
-            value: /root/.cache/huggingface/hub/perf
-        command:
+      - command:
         - /bin/sh
         - -c
         - |
-          #TODO: this can be baked into the aiperf image
-          apt-get update && apt-get install -y curl jq
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
+          pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
+          echo "aiperf installation completed";
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
+          cat /proc/sys/net/ipv4/ip_local_port_range
           export COLUMNS=200
           EPOCH=$(date +%s)
           ## utility functions -- can be moved to a bash script / configmap
           wait_for_model_ready() {
             echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
             while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
-                echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..."
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                 sleep 5
             done
             echo "✅ Model '$TARGET_MODEL' is now available!"
@@ -65,11 +53,11 @@ spec:
             key=concurrency_${concurrency}
             export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
             mkdir -p "$ARTIFACT_DIR"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
-                --tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
-                --endpoint-type chat \
-                --endpoint /v1/chat/completions \
+                --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
+                --endpoint-type chat  --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \
@@ -80,13 +68,14 @@ spec:
                 --extra-inputs "{\"min_tokens\":$osl}" \
                 --extra-inputs "{\"ignore_eos\":true}" \
                 --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+                --extra-inputs "{\"repetition_penalty\":1.0}" \
+                --extra-inputs "{\"temperature\": 0.0}" \
                 --concurrency $concurrency \
-                --request-count $((3*concurrency)) \
+                --request-count $((10*concurrency)) \
                 --warmup-request-count $concurrency \
-                --conversation-num 1 \
+                --conversation-num 12800 \
                 --random-seed 100 \
-                --request-rate 100000 \
-                --workers-max 128 \
+                --workers-max 252 \
                 -H 'Authorization: Bearer NOT USED' \
                 -H 'Accept: text/event-stream'\
                 --record-processors 32 \
@@ -97,10 +86,15 @@ spec:
           #### Actual execution ####
           wait_for_model_ready
           mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
           # Write input_config.json
           cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
           {
             "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
             "mode": "$DEPLOYMENT_MODE",
             "isl": $ISL,
             "osl": $OSL,
@@ -108,16 +102,50 @@ spec:
             "model endpoint": "$TARGET_MODEL"
           }
           EOF
-          # Run perf for each concurrency
-          for concurrency in $CONCURRENCIES; do
-            run_perf $concurrency $ISL $OSL
-            sleep 10
-          done
+
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
+        env:
+        - name: TARGET_MODEL
+          value: openai/gpt-oss-120b
+        - name: ENDPOINT
+          value: gpt-oss-agg-frontend:8000
+        - name: CONCURRENCY_PER_GPU
+          value: "900"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "4"
+        - name: ISL
+          value: "128"
+        - name: OSL
+          value: "1000"
+        - name: DEPLOYMENT_MODE
+          value: agg
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "252"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /model-cache/perf
+        - name: HF_HOME
+          value: /model-cache
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
+        imagePullPolicy: IfNotPresent
+        name: perf
+        securityContext:
+          privileged: true
         volumeMounts:
         - name: model-cache
-          mountPath: /root/.cache/huggingface
+          mountPath: /model-cache
+        workingDir: /workspace
       imagePullSecrets:
-        - name: nvcrimagepullsecret
+      - name: nvcrimagepullsecret
+      restartPolicy: Never
       volumes:
       - name: model-cache
         persistentVolumeClaim:
diff --git a/recipes/gpt-oss-120b/trtllm/agg/service.yaml b/recipes/gpt-oss-120b/trtllm/agg/service.yaml
deleted file mode 100644
index a1bee8e20e79..000000000000
--- a/recipes/gpt-oss-120b/trtllm/agg/service.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: v1
-kind: Service
-metadata:
-  name: gpt-oss-agg-trtllmworker
-spec:
-  selector:
-    nvidia.com/selector: gpt-oss-agg-trtllmworker
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000