From 15ac82a494dbcf2a058911e8bbf2aff2790a5173 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 24 Sep 2025 15:58:02 -0700 Subject: [PATCH 01/14] add model cache and use public aiperf image Signed-off-by: Biswa Panda --- .../gpt-oss-120b/model-cache/model-cache.yaml | 13 +++ .../model-cache/model-download.yaml | 47 ++++++++ recipes/gpt-oss-120b/trtllm/agg/bench.yaml | 85 ++++++++------ recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 108 ++++++++++++------ 4 files changed, 186 insertions(+), 67 deletions(-) create mode 100644 recipes/gpt-oss-120b/model-cache/model-cache.yaml create mode 100644 recipes/gpt-oss-120b/model-cache/model-download.yaml diff --git a/recipes/gpt-oss-120b/model-cache/model-cache.yaml b/recipes/gpt-oss-120b/model-cache/model-cache.yaml new file mode 100644 index 000000000000..790c46955e75 --- /dev/null +++ b/recipes/gpt-oss-120b/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 100Gi + storageClassName: "your-storage-class-name" \ No newline at end of file diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml new file mode 100644 index 000000000000..3471788195c7 --- /dev/null +++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: openai/gpt-oss-120b + - name: HF_HOME + value: /model-store + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: HF_TOKEN + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub hf_transfer + hf download $MODEL_NAME --exclude "original/*" --exclude "metal/*" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache \ No newline at end of file diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml index 6d1a3c422c96..7eb9da4158fc 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml @@ -11,40 +11,27 @@ spec: template: metadata: labels: - app: oss-gpt120b + app: oss-gpt120b-bench spec: - restartPolicy: Never + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/dynamo-graph-deployment-name + operator: In + values: + - gpt-oss-agg + topologyKey: kubernetes.io/hostname containers: - - name: perf - image: my-registry/vllm-runtime:my-tag - workingDir: /workspace/components/backends/vllm - env: - - name: TARGET_MODEL - value: openai/gpt-oss-120b - - name: ENDPOINT - value: gpt-oss-agg-trtllmworker:8000 - - name: CONCURRENCIES - value: "13000 13500 1400" - - name: ISL - value: "16" - - name: OSL - value: "1000" - - name: DEPLOYMENT_MODE - value: "agg" - - name: DEPLOYMENT_GPU_COUNT - value: "32" - - name: JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['job-name'] - - name: ROOT_ARTIFACT_DIR - value: /root/.cache/huggingface/hub/perf - command: + - command: - /bin/sh - -c - | #TODO: this can be baked into the aiperf image - apt-get update && apt-get install -y curl jq + apt-get update && apt-get install -y curl jq procps + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 EPOCH=$(date +%s) ## utility functions -- can be moved to a bash script / configmap @@ -81,12 +68,12 @@ spec: --extra-inputs "{\"ignore_eos\":true}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --concurrency $concurrency \ - --request-count $((3*concurrency)) \ + --request-count $((2*concurrency)) \ --warmup-request-count $concurrency \ --conversation-num 1 \ --random-seed 100 \ --request-rate 100000 \ - --workers-max 128 \ + --workers-max 252 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream'\ --record-processors 32 \ @@ -113,11 +100,45 @@ spec: run_perf $concurrency $ISL $OSL sleep 10 done + env: + - name: TARGET_MODEL + value: openai/gpt-oss-120b + - name: ENDPOINT + value: gpt-oss-agg-frontend:8000 + - name: CONCURRENCIES + value: "130000" + - name: ISL + value: "16" + - name: OSL + value: "1000" + - name: DEPLOYMENT_MODE + value: agg + - name: DEPLOYMENT_GPU_COUNT + value: "72" + - name: AIPERF_HTTP_CONNECTION_LIMIT + value: "252" + - name: JOB_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.labels['job-name'] + - name: ROOT_ARTIFACT_DIR + value: /root/.cache/huggingface/hub/perf + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:aiperf-lim-2-31b447d6 + imagePullPolicy: IfNotPresent + name: perf + resources: {} + securityContext: + privileged: true volumeMounts: - name: model-cache - mountPath: /root/.cache/huggingface + mountPath: /root/.cache/huggingface/hub + workingDir: /workspace/components/backends/vllm + dnsPolicy: ClusterFirst imagePullSecrets: - - name: nvcrimagepullsecret + - name: nvcrimagepullsecret + restartPolicy: Never + schedulerName: default-scheduler volumes: - name: model-cache persistentVolumeClaim: diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index 53080c4288fa..6bfde411ee35 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -3,64 +3,102 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: gpt-oss-agg-shm + name: gpt-oss-agg spec: backendFramework: trtllm services: + Frontend: + componentType: frontend + dynamoNamespace: gpt-oss-agg + extraPodSpec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/dynamo-graph-deployment-name + operator: In + values: + - gpt-oss-agg-frontend + topologyKey: kubernetes.io/hostname + mainContainer: + args: + - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 + command: + - /bin/sh + - -c + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot + pvc: + create: false + mountPoint: /model-store + name: model-cache + replicas: 18 TrtllmWorker: componentType: main - dynamoNamespace: gpt-oss-agg-shm + dynamoNamespace: gpt-oss-agg envFromSecret: hf-token-secret - pvc: - create: false - name: model-cache-oss-gpt120b - mountPoint: /root/.cache/huggingface - sharedMemory: - size: 80Gi extraPodSpec: - tolerations: - - key: "dedicated" - operator: "Equal" - value: "user-workload" - effect: "NoSchedule" - - key: "dedicated" - operator: "Equal" - value: "user-workload" - effect: "NoExecute" affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.present - operator: In - values: - - "true" + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - "true" + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/dynamo-graph-deployment-name + operator: In + values: + - gpt-oss-agg + topologyKey: kubernetes.io/hostname mainContainer: args: - | - export TRTLLM_ENABLE_PDL=1 - export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True - export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"} - export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"} - export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} - trap 'echo Cleaning up...; kill 0' EXIT - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & python3 -m dynamo.trtllm \ - --model-path "$MODEL_PATH" \ - --served-model-name "$SERVED_MODEL_NAME" \ - --extra-engine-args "$ENGINE_ARGS" \ + --model-path "/model-store/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" \ + --served-model-name "openai/gpt-oss-120b" \ + --extra-engine-args "${ENGINE_ARGS}" \ --max-num-tokens 20000 \ --max-batch-size 640 \ --free-gpu-memory-fraction 0.9 command: - /bin/sh - -c - image: my-registry/vllm-runtime:my-tag + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot + env: + - name: TRTLLM_ENABLE_PDL + value: "1" + - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL + value: "True" + - name: SERVED_MODEL_NAME + value: "openai/gpt-oss-120b" + - name: ENGINE_ARGS + value: "/opt/dynamo/configs/config.yaml" + - name: MODEL_PATH + value: "/model-store/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" + volumeMounts: + - mountPath: /opt/dynamo/configs + name: llm-config + readOnly: true workingDir: /workspace/components/backends/trtllm - replicas: 1 + volumes: + - configMap: + name: llm-config + name: llm-config + pvc: + create: false + mountPoint: /model-store + name: model-cache + replicas: 18 resources: limits: gpu: "4" requests: - gpu: "4" \ No newline at end of file + gpu: "4" + sharedMemory: + size: 80Gi \ No newline at end of file From 61e60db2bec24b5cc17a399080f8c466ba298fc8 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 24 Sep 2025 17:06:50 -0700 Subject: [PATCH 02/14] fix Signed-off-by: Biswa Panda --- recipes/gpt-oss-120b/model-cache/model-download.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml index 3471788195c7..df898a792424 100644 --- a/recipes/gpt-oss-120b/model-cache/model-download.yaml +++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml @@ -28,11 +28,6 @@ spec: value: /model-store - name: HF_HUB_ENABLE_HF_TRANSFER value: "1" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: HF_TOKEN args: - | set -eux From cc8088bc0ffb03003c928af0258d4432f3e1419e Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 00:43:04 -0700 Subject: [PATCH 03/14] update commits --- recipes/gpt-oss-120b/README.md | 21 +++++++ recipes/gpt-oss-120b/trtllm/agg/bench.yaml | 59 +++++++++++--------- recipes/gpt-oss-120b/trtllm/agg/config.yaml | 16 ++---- recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 11 ++-- recipes/gpt-oss-120b/trtllm/agg/service.yaml | 13 ----- 5 files changed, 64 insertions(+), 56 deletions(-) create mode 100644 recipes/gpt-oss-120b/README.md delete mode 100644 recipes/gpt-oss-120b/trtllm/agg/service.yaml diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md new file mode 100644 index 000000000000..06e448d332cb --- /dev/null +++ b/recipes/gpt-oss-120b/README.md @@ -0,0 +1,21 @@ +Note: + +- This recipe is for gpt-oss-120b in aggregated mode. + +# Running the recipe +```bash +./run.sh --model gpt-oss-120b --framework trtllm agg +``` + +# Images + +This recipe uses the following container images using custom commits. You might need to build the images to reproduce the benchmark. + +* aiperf + Based on commit [70af59489df24a601dba57604a7341966150b366](https://github.com/ai-dynamo/aiperf/commit/70af59489df24a601dba57604a7341966150b366) + +* dynamo trtllm runtime for arm64 +based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f) +``` +nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 +``` \ No newline at end of file diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml index 7eb9da4158fc..02843db6851f 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/bench.yaml @@ -28,8 +28,9 @@ spec: - /bin/sh - -c - | - #TODO: this can be baked into the aiperf image - apt-get update && apt-get install -y curl jq procps + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; sysctl -w net.ipv4.ip_local_port_range="1024 65000" cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 @@ -38,7 +39,7 @@ spec: wait_for_model_ready() { echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..." + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" sleep 5 done echo "✅ Model '$TARGET_MODEL' is now available!" @@ -52,11 +53,11 @@ spec: key=concurrency_${concurrency} export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ - --tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ - --endpoint-type chat \ - --endpoint /v1/chat/completions \ + --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ + --endpoint-type chat --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ @@ -67,12 +68,13 @@ spec: --extra-inputs "{\"min_tokens\":$osl}" \ --extra-inputs "{\"ignore_eos\":true}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --extra-inputs "{\"repetition_penalty\":1.0}" \ + --extra-inputs "{\"temperature\": 0.0}" \ --concurrency $concurrency \ - --request-count $((2*concurrency)) \ + --request-count $((10*concurrency)) \ --warmup-request-count $concurrency \ - --conversation-num 1 \ + --conversation-num 12800 \ --random-seed 100 \ - --request-rate 100000 \ --workers-max 252 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream'\ @@ -84,10 +86,15 @@ spec: #### Actual execution #### wait_for_model_ready mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" # Write input_config.json cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" < Date: Mon, 29 Sep 2025 15:15:25 -0700 Subject: [PATCH 04/14] fix: remove antiaffinity --- recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index 9cfcabc4ab98..4020c6a71cf1 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -47,15 +47,6 @@ spec: operator: In values: - "true" - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: nvidia.com/dynamo-graph-deployment-name - operator: In - values: - - gpt-oss-agg - topologyKey: kubernetes.io/hostname mainContainer: args: - | From 5a42f4955eda57a17d1444f752d31690fd9b7e5b Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 17:07:10 -0700 Subject: [PATCH 05/14] freeze version: --- recipes/gpt-oss-120b/model-cache/model-download.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml index df898a792424..f6554cf83886 100644 --- a/recipes/gpt-oss-120b/model-cache/model-download.yaml +++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml @@ -28,11 +28,13 @@ spec: value: /model-store - name: HF_HUB_ENABLE_HF_TRANSFER value: "1" + - name: MODEL_REVISION + value: b5c939de8f754692c1647ca79fbf85e8c1e70f8a args: - | set -eux pip install --no-cache-dir huggingface_hub hf_transfer - hf download $MODEL_NAME --exclude "original/*" --exclude "metal/*" + hf download $MODEL_NAME --revision $MODEL_REVISION --exclude "original/*" --exclude "metal/*" volumeMounts: - name: model-cache mountPath: /model-store From ac596ab0dc55c253e933cff49010f52b7bffac22 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 17:21:48 -0700 Subject: [PATCH 06/14] fix --- recipes/gpt-oss-120b/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index 06e448d332cb..0ce400bc8466 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -9,10 +9,8 @@ Note: # Images -This recipe uses the following container images using custom commits. You might need to build the images to reproduce the benchmark. - -* aiperf - Based on commit [70af59489df24a601dba57604a7341966150b366](https://github.com/ai-dynamo/aiperf/commit/70af59489df24a601dba57604a7341966150b366) +This recipe uses the following trtllm container image based on pre release/0.5.1 commit. +You might need to build the images to reproduce the benchmark. * dynamo trtllm runtime for arm64 based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f) From 0cbecad691feda21141a3ff79eb67471fee53094 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 17:42:24 -0700 Subject: [PATCH 07/14] fix --- recipes/gpt-oss-120b/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index 0ce400bc8466..01a009822d5d 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -13,7 +13,16 @@ This recipe uses the following trtllm container image based on pre release/0.5.1 You might need to build the images to reproduce the benchmark. * dynamo trtllm runtime for arm64 -based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f) +Below image is built based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f) ``` nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 +``` + +Steps to build the image: +```bash +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo +git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f + +./container/build.sh --framework TRTLLM --target runtime ``` \ No newline at end of file From d67ef6efe2451cd2b10bfae97c7bf5e80151b90b Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 17:47:02 -0700 Subject: [PATCH 08/14] fix --- recipes/gpt-oss-120b/README.md | 50 ++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index 01a009822d5d..e88ed0a640af 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -1,28 +1,48 @@ -Note: +# GPT-OSS-120B Recipe Guide -- This recipe is for gpt-oss-120b in aggregated mode. +This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup. + +## Quick Start + +To run the model, simply execute this command in your terminal: -# Running the recipe ```bash ./run.sh --model gpt-oss-120b --framework trtllm agg ``` -# Images +## System Requirements + +### Model Download -This recipe uses the following trtllm container image based on pre release/0.5.1 commit. -You might need to build the images to reproduce the benchmark. +### Container Image +This recipe was tested with dynamo trtllm runtime container for ARM64 processors. -* dynamo trtllm runtime for arm64 -Below image is built based on commit [7fdf50fec2cae9112224f5cea26cef3dde78506f](https://github.com/ai-dynamo/dynamo/commit/7fdf50fec2cae9112224f5cea26cef3dde78506f) +**Pre-built Image:** ``` nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 ``` -Steps to build the image: -```bash -git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo -git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f +### Building Your Own Image (Optional) + +If you need to build the container image yourself (for example, if you're using different hardware or want to customize the setup): + +1. **Clone the repository:** + ```bash + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + ``` + +2. **Switch to the specific version:** + ```bash + git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f + ``` + +3. **Build the container:** + ```bash + ./container/build.sh --framework TRTLLM --target runtime + ``` + +## Notes +1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup. -./container/build.sh --framework TRTLLM --target runtime -``` \ No newline at end of file +2. storage class is not specified in the recipe, you need to specify it in the `deploy.yaml` file. \ No newline at end of file From 123a9fc25496a889e2c52364d91ff50aae5149f0 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 17:52:25 -0700 Subject: [PATCH 09/14] fix --- recipes/gpt-oss-120b/README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index e88ed0a640af..b4351fe47f38 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -2,17 +2,34 @@ This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup. +## Prerequisites + +follow the instructions in recipe [README.md](../README.md) to create a namespace and kubernetes secret for huggingface token. + ## Quick Start To run the model, simply execute this command in your terminal: ```bash +cd recipe ./run.sh --model gpt-oss-120b --framework trtllm agg ``` -## System Requirements +## (Alternative) Step by Step Guide + +### 1. Download the Model -### Model Download +```bash +cd recipes/gpt-oss-120b +kubectl apply -n $NAMESPACE -f ./model-cache +``` + +### 2. Deploy and Benchmark the Model + +```bash +cd recipes/gpt-oss-120b +kubectl apply -n $NAMESPACE -f ./trtllm/agg +``` ### Container Image This recipe was tested with dynamo trtllm runtime container for ARM64 processors. From d05e7ca45de16d8f0110c983aa3165aede8fa458 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 18:05:46 -0700 Subject: [PATCH 10/14] fix --- recipes/gpt-oss-120b/README.md | 23 +-------------------- recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 ++-- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index b4351fe47f38..4e93dc5755e3 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -34,31 +34,10 @@ kubectl apply -n $NAMESPACE -f ./trtllm/agg ### Container Image This recipe was tested with dynamo trtllm runtime container for ARM64 processors. -**Pre-built Image:** ``` -nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 +nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 ``` -### Building Your Own Image (Optional) - -If you need to build the container image yourself (for example, if you're using different hardware or want to customize the setup): - -1. **Clone the repository:** - ```bash - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - ``` - -2. **Switch to the specific version:** - ```bash - git checkout 7fdf50fec2cae9112224f5cea26cef3dde78506f - ``` - -3. **Build the container:** - ```bash - ./container/build.sh --framework TRTLLM --target runtime - ``` - ## Notes 1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup. diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index 4020c6a71cf1..c19a275eebf2 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -27,7 +27,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 + image: my-registry/trtllm-runtime:my-tag pvc: create: false mountPoint: /model-store @@ -61,7 +61,7 @@ spec: command: - /bin/sh - -c - image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:7fdf50fec2cae9112224f5cea26cef3dde78506f-35606896-trtllm-arm64 + image: my-registry/trtllm-runtime:my-tag env: - name: TRTLLM_ENABLE_PDL value: "1" From 11554a9d3bc92c8c28f0b272cda87627895600fc Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 19:28:57 -0700 Subject: [PATCH 11/14] fix --- recipes/gpt-oss-120b/README.md | 8 ++++++++ recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index 4e93dc5755e3..5ba4b8f49859 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -34,10 +34,18 @@ kubectl apply -n $NAMESPACE -f ./trtllm/agg ### Container Image This recipe was tested with dynamo trtllm runtime container for ARM64 processors. +**Important Note:** + +Before dynamo v0.5.1 release, following container image is supported: ``` nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 ``` +After dynamo v0.5.1 release, following container image will be supported: +``` +nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1 +``` + ## Notes 1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup. diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index c19a275eebf2..608725b1f526 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -27,7 +27,7 @@ spec: command: - /bin/sh - -c - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 pvc: create: false mountPoint: /model-store @@ -61,7 +61,7 @@ spec: command: - /bin/sh - -c - image: my-registry/trtllm-runtime:my-tag + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 env: - name: TRTLLM_ENABLE_PDL value: "1" From c737358297f77edf1ff782c6164e1ff234e836e0 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 19:30:51 -0700 Subject: [PATCH 12/14] fix --- recipes/gpt-oss-120b/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md index 5ba4b8f49859..36ab361503fd 100644 --- a/recipes/gpt-oss-120b/README.md +++ b/recipes/gpt-oss-120b/README.md @@ -4,7 +4,7 @@ This guide will help you run the GPT-OSS-120B language model using Dynamo's opti ## Prerequisites -follow the instructions in recipe [README.md](../README.md) to create a namespace and kubernetes secret for huggingface token. +Follow the instructions in recipe [README.md](../README.md) to create a namespace and kubernetes secret for huggingface token. ## Quick Start From 9e0e434de3a79cb7bed3d726c06ddbf4b4f184e1 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 29 Sep 2025 20:00:06 -0700 Subject: [PATCH 13/14] fix --- recipes/gpt-oss-120b/trtllm/agg/{bench.yaml => perf.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename recipes/gpt-oss-120b/trtllm/agg/{bench.yaml => perf.yaml} (100%) diff --git a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml similarity index 100% rename from recipes/gpt-oss-120b/trtllm/agg/bench.yaml rename to recipes/gpt-oss-120b/trtllm/agg/perf.yaml From 4cfa0c59c760d528370195643db4cc71edc0d110 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Thu, 2 Oct 2025 12:51:02 -0700 Subject: [PATCH 14/14] update --- recipes/gpt-oss-120b/trtllm/agg/deploy.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index fb5ba0bf2801..6f725af31088 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -99,6 +99,4 @@ spec: limits: gpu: "4" requests: - gpu: "4" - sharedMemory: - size: 80Gi \ No newline at end of file + gpu: "4" \ No newline at end of file