diff --git a/recipes/gpt-oss-120b/README.md b/recipes/gpt-oss-120b/README.md new file mode 100644 index 000000000000..36ab361503fd --- /dev/null +++ b/recipes/gpt-oss-120b/README.md @@ -0,0 +1,52 @@ +# GPT-OSS-120B Recipe Guide + +This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup. + +## Prerequisites + +Follow the instructions in the recipes [README.md](../README.md) to create a namespace and a Kubernetes secret for your Hugging Face token. + +## Quick Start + +To run the model, simply execute this command in your terminal: + +```bash +cd recipes +./run.sh --model gpt-oss-120b --framework trtllm agg +``` + +## (Alternative) Step by Step Guide + +### 1. Download the Model + +```bash +cd recipes/gpt-oss-120b +kubectl apply -n $NAMESPACE -f ./model-cache +``` + +### 2. Deploy and Benchmark the Model + +```bash +cd recipes/gpt-oss-120b +kubectl apply -n $NAMESPACE -f ./trtllm/agg +``` + +### Container Image +This recipe was tested with the Dynamo TensorRT-LLM runtime container for ARM64 processors. + +**Important Note:** + +Before the Dynamo v0.5.1 release, the following container image is supported: +``` +nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 +``` + +After the Dynamo v0.5.1 release, the following container image will be supported: +``` +nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1 +``` + +## Notes +1. The benchmark job pins aiperf to a specific commit to ensure reproducible results and compatibility with the benchmarking setup. + +2. The storage class in the recipe is only a placeholder; set `storageClassName` in `model-cache/model-cache.yaml` to a storage class available in your cluster before applying it. \ No newline at end of file diff --git a/recipes/gpt-oss-120b/model-cache/model-cache.yaml b/recipes/gpt-oss-120b/model-cache/model-cache.yaml new file mode 100644 index 000000000000..790c46955e75 --- /dev/null +++ b/recipes/gpt-oss-120b/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 100Gi + storageClassName: "your-storage-class-name"  # placeholder: replace with a storage class available in your cluster \ No newline at end of file diff --git a/recipes/gpt-oss-120b/model-cache/model-download.yaml b/recipes/gpt-oss-120b/model-cache/model-download.yaml new file mode 100644 index 000000000000..f6554cf83886 --- /dev/null +++ b/recipes/gpt-oss-120b/model-cache/model-download.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: openai/gpt-oss-120b + - name: HF_HOME + value: /model-store  # same path as the model-cache volumeMount below, so the download lands on the PVC + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: MODEL_REVISION + value: b5c939de8f754692c1647ca79fbf85e8c1e70f8a  # pinned snapshot; deploy.yaml MODEL_PATH points at this same revision directory + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub hf_transfer + hf download "$MODEL_NAME" --revision "$MODEL_REVISION" --exclude "original/*" --exclude "metal/*" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache \ No newline at end of file diff --git a/recipes/gpt-oss-120b/trtllm/agg/config.yaml b/recipes/gpt-oss-120b/trtllm/agg/config.yaml index 5f50bb24b98f..2d1701bc3bdc 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/config.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/config.yaml @@ -6,20 +6,12 @@ metadata: name: llm-config data: config.yaml: | - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 enable_attention_dp: true - 
build_config: - max_batch_size: 640 - max_num_tokens: 20000 - moe_config: - backend: CUTLASS cuda_graph_config: - max_batch_size: 640 + max_batch_size: 800 enable_padding: true kv_cache_config: - free_gpu_memory_fraction: 0.9 enable_block_reuse: false - print_iter_log: false - stream_interval: 50 - use_torch_sampler: true \ No newline at end of file + stream_interval: 20 + moe_config: + backend: CUTLASS \ No newline at end of file diff --git a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml index 7dd176f3ecb6..d3c8479f1b12 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml @@ -3,61 +3,91 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: gpt-oss-agg-shm + name: gpt-oss-agg spec: backendFramework: trtllm services: + Frontend: + componentType: frontend + dynamoNamespace: gpt-oss-agg + extraPodSpec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/dynamo-graph-deployment-name + operator: In + values: + - gpt-oss-agg-frontend + topologyKey: kubernetes.io/hostname + mainContainer: + args: + - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 + command: + - /bin/sh + - -c + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1 + pvc: + create: false + mountPoint: /model-store + name: model-cache + replicas: 1 TrtllmWorker: componentType: main - dynamoNamespace: gpt-oss-agg-shm + dynamoNamespace: gpt-oss-agg envFromSecret: hf-token-secret - pvc: - create: false - name: model-cache-oss-gpt120b - mountPoint: /root/.cache/huggingface sharedMemory: size: 80Gi extraPodSpec: - tolerations: - - key: "dedicated" - operator: "Equal" - value: "user-workload" - effect: "NoSchedule" - - key: "dedicated" - operator: "Equal" - value: "user-workload" - effect: "NoExecute" affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: 
nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.present - operator: In - values: - - "true" + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - "true" mainContainer: args: - | - export TRTLLM_ENABLE_PDL=1 - export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True - export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"} - export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"} - export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} - trap 'echo Cleaning up...; kill 0' EXIT - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & python3 -m dynamo.trtllm \ - --model-path "$MODEL_PATH" \ - --served-model-name "$SERVED_MODEL_NAME" \ - --extra-engine-args "$ENGINE_ARGS" \ - --max-num-tokens 20000 \ - --max-batch-size 640 \ + --model-path "${MODEL_PATH}" \ + --served-model-name "${SERVED_MODEL_NAME}" \ + --extra-engine-args "${ENGINE_ARGS}" \ + --tensor-parallel-size 4 \ + --expert-parallel-size 4 \ + --max-batch-size 800 \ --free-gpu-memory-fraction 0.9 command: - /bin/sh - -c - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.1 + env: + - name: TRTLLM_ENABLE_PDL + value: "1" + - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL + value: "True" + - name: SERVED_MODEL_NAME + value: "openai/gpt-oss-120b"  # consumed by --served-model-name in args + - name: ENGINE_ARGS + value: "/opt/dynamo/configs/config.yaml"  # config.yaml key of the llm-config ConfigMap, mounted via volumeMounts below + - name: MODEL_PATH + value: "/model-store/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" + volumeMounts: + - mountPath: /opt/dynamo/configs + name: llm-config + readOnly: true + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1 workingDir: /workspace/components/backends/trtllm + volumes: + - configMap: + name: llm-config + name: llm-config + pvc: + create: false + mountPoint: /model-store + name: model-cache replicas: 1 resources: limits: diff --git 
a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml similarity index 51% rename from recipes/gpt-oss-120b/trtllm/agg/bench.yaml rename to recipes/gpt-oss-120b/trtllm/agg/perf.yaml index 76a1fd45126e..eed5d69addbf 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/bench.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml @@ -11,47 +11,35 @@ spec: template: metadata: labels: - app: oss-gpt120b + app: oss-gpt120b-bench spec: - restartPolicy: Never + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/dynamo-graph-deployment-name + operator: In + values: + - gpt-oss-agg + topologyKey: kubernetes.io/hostname containers: - - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.1 - workingDir: /workspace/components/backends/vllm - env: - - name: TARGET_MODEL - value: openai/gpt-oss-120b - - name: ENDPOINT - value: gpt-oss-agg-trtllmworker:8000 - - name: CONCURRENCIES - value: "13000 13500 1400" - - name: ISL - value: "16" - - name: OSL - value: "1000" - - name: DEPLOYMENT_MODE - value: "agg" - - name: DEPLOYMENT_GPU_COUNT - value: "32" - - name: JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['job-name'] - - name: ROOT_ARTIFACT_DIR - value: /root/.cache/huggingface/hub/perf - command: + - command: - /bin/sh - -c - | - #TODO: this can be baked into the aiperf image - apt-get update && apt-get install -y curl jq + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 EPOCH=$(date +%s) ## utility functions -- can be moved to a bash script / configmap wait_for_model_ready() { echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." 
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..." + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" sleep 5 done echo "✅ Model '$TARGET_MODEL' is now available!" @@ -65,11 +53,11 @@ spec: key=concurrency_${concurrency} export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ - --tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ - --endpoint-type chat \ - --endpoint /v1/chat/completions \ + --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ + --endpoint-type chat --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ @@ -80,13 +68,14 @@ spec: --extra-inputs "{\"min_tokens\":$osl}" \ --extra-inputs "{\"ignore_eos\":true}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --extra-inputs "{\"repetition_penalty\":1.0}" \ + --extra-inputs "{\"temperature\": 0.0}" \ --concurrency $concurrency \ - --request-count $((3*concurrency)) \ + --request-count $((10*concurrency)) \ --warmup-request-count $concurrency \ - --conversation-num 1 \ + --conversation-num 12800 \ --random-seed 100 \ - --request-rate 100000 \ - --workers-max 128 \ + --workers-max 252 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream'\ --record-processors 32 \ @@ -97,10 +86,15 @@ spec: #### Actual execution #### wait_for_model_ready mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total 
concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" # Write input_config.json cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <