Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ b200:
- 'b200-nvd_1'
- 'b200-nvd_2'
- 'b200-nvd_3'
- 'b200-dgxc_1'
- 'b200-dgxc_2'
mi300x:
- 'mi300x-amd_0'
- 'mi300x-amd_1'
Expand Down
78 changes: 78 additions & 0 deletions benchmarks/gptoss_fp4_b200_trt_docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT

# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on CONC =========

#######################################
# Pick the parallelism settings for this run.
# Globals:
#   CONC (read)  - benchmark concurrency; unset or empty is treated as 0
#   TP   (read)  - tensor-parallel size
#   EP_SIZE, MOE_BACKEND, DP_ATTENTION (written)
# Only CONC drives the choice; ISL and OSL are not consulted here.
#######################################
select_parallel_config() {
  # Defaults, used for concurrency below 256.
  EP_SIZE="1"
  MOE_BACKEND="TRTLLM"
  DP_ATTENTION=false

  # Higher concurrencies (CONC >= 256): enable DP attention and set the
  # expert-parallel size equal to TP.
  # NOTE(review): the comment that used to sit here said the MoE backend is
  # CUTLASS in this range, but MOE_BACKEND is left at TRTLLM — confirm which
  # one is intended.
  if [[ "${CONC:-0}" -ge 256 ]]; then
    EP_SIZE="$TP"
    DP_ATTENTION=true
  fi
}

select_parallel_config

echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"

# YAML file handed to trtllm-serve through --extra_llm_api_options.
EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

#######################################
# Write the extra LLM API options file.
# Globals:
#   EXTRA_CONFIG_FILE (read) - path of the YAML file to (re)create
#   CONC, DP_ATTENTION, MOE_BACKEND (read) - interpolated into the YAML
# Outputs:
#   Overwrites $EXTRA_CONFIG_FILE. Nested options are indented under their
#   parent key; without that indentation the parent keys would be null and
#   their options would become unrelated top-level keys.
#######################################
write_extra_config() {
  # Heredoc bodies stay at column 0 so the YAML indentation is exactly what
  # is written below.
  cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
  enable_padding: true
  max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
  backend: $MOE_BACKEND
EOF

  # The attention-DP section is only appended when DP attention is enabled.
  if [[ "$DP_ATTENTION" == "true" ]]; then
    cat >> "$EXTRA_CONFIG_FILE" << EOF
attention_dp_config:
  enable_balance: true
EOF
  fi
}

write_extra_config

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Trace every command from here on so the exact server invocation is logged.
set -x

# Value passed to trtllm-serve as --max_num_tokens.
MAX_NUM_TOKENS=20000

# Launch TRT-LLM server.
# All expansions are quoted so a value containing whitespace or glob
# characters reaches trtllm-serve as a single argument.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE"
94 changes: 94 additions & 0 deletions runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/bash

# Host directory that is bind-mounted into the server container at $HF_HUB_CACHE.
HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"

# Port the server listens on and the client connects to.
PORT=8888

# Benchmark scripts for the "trt" framework carry a "_trt" infix in their
# file name; every other framework gets no suffix.
if [[ "$FRAMEWORK" == "trt" ]]; then
  FRAMEWORK_SUFFIX='_trt'
else
  FRAMEWORK_SUFFIX=''
fi

# Last path component of MODEL.
# NOTE(review): not referenced anywhere else in this script as shown.
MODEL_NAME=$(basename "$MODEL")

# Container names for the inference server and the benchmark client.
server_name="bmk-server"
client_name="bmk-client"

# Log the GPU state before anything is launched.
nvidia-smi

# GPUs must be idle: if the compute-apps query reports any PID, abort the run.
busy_pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
if [[ "$busy_pids" =~ [0-9] ]]; then
  echo "[ERROR] GPU busy from previous run"
  nvidia-smi
  exit 1
fi

set -x
# Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup
# Ref: https://www.paolomainardi.com/posts/docker-run-init/

# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs.
# Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes.
# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register

# IMAGE arrives with its first '/' written as '#'; turn it back into a real
# image reference.
server_image=${IMAGE/\#//}

# Start the server detached. The container runs the benchmark script selected
# by EXP_NAME (text before the first '_'), PRECISION and FRAMEWORK_SUFFIX.
# "-e NAME" without a value forwards the variable from this shell's environment.
docker run --rm -d --init --network host --name "$server_name" \
  --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
  -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
  -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT="$PORT" -e EP_SIZE \
  -e NCCL_GRAPH_REGISTER=0 \
  -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
  --entrypoint=/bin/bash \
  "$server_image" \
  benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"

# Stop tracing while server logs are streamed, to keep the output readable.
set +x

#######################################
# Echo log lines from stdin until the server reports readiness.
# Outputs: every line read, unchanged, on stdout
# Returns: 0 once a line containing "Application startup complete" is seen,
#          1 if the stream ends before that line appears
#######################################
wait_for_startup() {
  local line
  while IFS= read -r line; do
    printf '%s\n' "$line"
    if [[ "$line" =~ Application\ startup\ complete ]]; then
      return 0
    fi
  done
  return 1
}

# Follow the server container's log (stdout and stderr). If the stream ends
# without the readiness line, the server is not up, so abort instead of
# benchmarking against a dead endpoint.
if ! wait_for_startup < <(docker logs -f --tail=0 "$server_name" 2>&1); then
  echo "[ERROR] Server log ended before 'Application startup complete'" >&2
  # Best-effort cleanup; the container may already be gone.
  docker rm -f "$server_name" >/dev/null 2>&1 || true
  exit 1
fi

# Fetch the benchmark client; the client container later runs
# bench_serving/benchmark_serving.py from /workspace.
# NOTE(review): presumably the current directory is $GITHUB_WORKSPACE — confirm.
bench_repo_url="https://github.com/kimbochen/bench_serving.git"
git clone "$bench_repo_url"


# Number of prompts scales with concurrency. The two DeepSeek-R1 checkpoints
# use 20 prompts per unit of concurrency when OSL is 8192 and 50 otherwise;
# every other model uses 10.
case "$MODEL" in
  "nvidia/DeepSeek-R1-0528-FP4" | "deepseek-ai/DeepSeek-R1-0528")
    if [[ "$OSL" == "8192" ]]; then
      prompts_per_conc=20
    else
      prompts_per_conc=50
    fi
    ;;
  *)
    prompts_per_conc=10
    ;;
esac
NUM_PROMPTS=$(( CONC * prompts_per_conc ))

set -x

# IMAGE arrives with its first '/' written as '#'; turn it back into a real
# image reference.
client_image=${IMAGE/\#//}

# Run the benchmark client in the foreground, in the same image as the server.
# It installs its Python dependencies, then drives the server on
# localhost:$PORT and writes $RESULT_FILENAME.json into /workspace/.
# The -lc string is expanded by this shell before docker sees it.
docker run --rm --network host --name "$client_name" \
  -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
  -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
  --entrypoint=/bin/bash \
  "$client_image" \
  -lc "pip install -q datasets pandas && \
python3 bench_serving/benchmark_serving.py \
--model $MODEL --backend vllm --base-url http://localhost:$PORT \
--dataset-name random \
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
--num-prompts $NUM_PROMPTS \
--max-concurrency $CONC \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ --result-filename $RESULT_FILENAME.json"
Comment thread
ankursingh-nv marked this conversation as resolved.
Outdated

# Prints the PIDs of processes currently using the GPUs for compute, one per line.
gpu_compute_pids() {
  nvidia-smi --query-compute-apps=pid --format=csv,noheader
}

# Tear the server container down in escalating steps. Each step is
# best-effort, so a failure never aborts the cleanup.
docker stop -t 90 "$server_name" || :                # ask it to stop, 90 s timeout
docker wait "$server_name" >/dev/null 2>&1 || :      # block until it has exited
docker rm -f "$server_name" >/dev/null 2>&1 || :     # force-remove any leftover

# Short pause so GPU processes can finish terminating.
sleep 2

# Warn (without failing) if something is still running on the GPUs.
if gpu_compute_pids | grep -q '[0-9]'; then
  echo "[WARN] After stop, GPU still busy:"
  nvidia-smi
  # Last resort if driver allows and GPUs appear idle otherwise:
  #nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
fi

# Final GPU state for the log.
nvidia-smi
Loading