diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 80be8101e..97665ca53 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2706,6 +2706,29 @@ dsv4-fp4-b300-vllm:
           - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
           - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
+dsv4-fp4-b300-trt:
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: trt
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+      - isl: 1024
+        osl: 1024
+        search-space:
+          - { tp: 4, conc-start: 1, conc-end: 64 }
+          - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 }
+      - isl: 8192
+        osl: 1024
+        search-space:
+          - { tp: 4, conc-start: 1, conc-end: 32 }
+          - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+
 dsv4-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.20.0-cu130
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index fd57c8436..4b7966df0 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -164,12 +164,13 @@ jobs:
           clean: true
           submodules: true
 
-      - name: Cleanup stale eval outputs (pre-run)
-        if: ${{ inputs.run-eval || inputs.eval-only }}
+      - name: Cleanup stale outputs (pre-run)
         run: |
           rm -f meta_env.json || true
           rm -f results*.json || true
           rm -f sample*.jsonl || true
+          rm -f server.log || true
+          rm -f gpu_metrics.csv || true
 
       - name: Launch job script
         env:
diff --git a/AGENTS.md b/AGENTS.md
index c5a72fe77..2f15cbfaa 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -495,6 +495,7 @@ Markers available: `slow`, `integration`
 ## Important Notes
 
 1. Make sure no new directories are created in `/workspace` during the benchmark. Files are ok.
+2. **Never delete or modify whitespace in `perf-changelog.yaml`** — the CI pipeline depends on the exact whitespace (including trailing spaces on blank separator lines). Removing or altering whitespace will break CI and cause pipeline crashes.
 
 ## Fetching GitHub Actions Benchmark Results
 
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 4c0c8642e..0cb8fdcd0 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -165,11 +165,12 @@ wait_for_server_ready() {
 }
 
 # Run benchmark serving with standardized parameters
-# All parameters are required except --use-chat-template, --dsv4, and --trust-remote-code
+# All parameters are required except --endpoint, --use-chat-template, --dsv4, and --trust-remote-code
 # Parameters:
 #   --model: Model name
 #   --port: Server port
 #   --backend: Backend type - e.g., 'vllm' or 'openai'
+#   --endpoint: Optional API endpoint override
 #   --input-len: Random input sequence length
 #   --output-len: Random output sequence length
 #   --random-range-ratio: Random range ratio
@@ -194,6 +195,7 @@ run_benchmark_serving() {
     local model=""
     local port=""
     local backend=""
+    local endpoint=""
     local input_len=""
     local output_len=""
     local random_range_ratio=""
@@ -221,6 +223,10 @@ run_benchmark_serving() {
             --backend)
                 backend="$2"
                 shift 2
                 ;;
+            --endpoint)
+                endpoint="$2"
+                shift 2
+                ;;
             --input-len)
                 input_len="$2"
                 shift 2
                 ;;
@@ -356,6 +362,10 @@ run_benchmark_serving() {
         --result-dir "$result_dir"
         --result-filename "$result_filename.json"
     )
+
+    if [[ -n "$endpoint" ]]; then
+        benchmark_cmd+=(--endpoint "$endpoint")
+    fi
 
     # Add --use-chat-template if requested
     if [[ "$use_chat_template" == true ]]; then
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
new file mode 100644
index 000000000..1356ecbac
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+
+# DeepSeek-V4-Pro single-node TRTLLM recipe for B300. The configured image
+# already contains NVIDIA/TensorRT-LLM@feat/deepseek_v4; do not build TRTLLM at
+# runtime from this benchmark path.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    DP_ATTENTION \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}"
+export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}"
+
+sanitize_slurm_mpi_env_for_trtllm() {
+    if [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" != "1" ]]; then
+        return 0
+    fi
+
+    echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch"
+    while IFS='=' read -r name _; do
+        case "$name" in
+            SLURM_*|PMIX*|PMI*|OMPI_*|ORTE_*)
+                unset "$name"
+                ;;
+        esac
+    done < <(env)
+}
+
+sanitize_slurm_mpi_env_for_trtllm
+
+export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
+echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
+
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
+nvidia-smi
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
+
+MOE_BACKEND="TRTLLM"
+MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
+CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
+KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+
+ATTENTION_DP_CONFIG=""
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    ATTENTION_DP_CONFIG="
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60"
+fi
+
+cat > "$EXTRA_CONFIG_FILE" << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
+enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
+print_iter_log: true
+kv_cache_config:
+  tokens_per_block: 128
+  dtype: fp8
+  free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
+  enable_block_reuse: false
+stream_interval: 10
+num_postprocess_workers: 4
+moe_config:
+  backend: $MOE_BACKEND
+EOF
+
+echo "Generated config file contents:"
+cat "$EXTRA_CONFIG_FILE"
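+
+# NOTE: the YAML above is handed to trtllm-serve through --config below. CUDA
+# graphs are captured with padding up to the concurrency-derived batch size,
+# the KV cache runs in fp8 within the configured free-memory fraction, and
+# block reuse is disabled so repeated benchmark prompts do not hit cached prefixes.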
+
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
+# DeepSeek-V4-Pro has hidden size 7168. The current TRTLLM fused-HC MHC
+# path corrupts eval generations for this shape; keep eval servers on the
+# unfused path until the fused kernel is guarded or supports 7168.
+export TRTLLM_MHC_ENABLE_FUSED_HC=0
+echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC"
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+set -x
+SERVE_CMD=(
+    trtllm-serve "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust_remote_code \
+    --backend pytorch \
+    --max_batch_size "$MAX_BATCH_SIZE" \
+    --max_seq_len "$MAX_MODEL_LEN" \
+    --max_num_tokens "$MAX_NUM_TOKENS" \
+    --tp_size "$TP" \
+    --ep_size "$EP_SIZE" \
+    --custom_tokenizer deepseek_v4 \
+    --config "$EXTRA_CONFIG_FILE"
+)
+
+if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then
+    "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+else
+    mpirun -n 1 --oversubscribe --allow-run-as-root \
+        "${SERVE_CMD[@]}" \
+        > "$SERVER_LOG" 2>&1 &
+fi
+
+SERVER_PID=$!
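+
+# $! is the PID of the backgrounded launcher (mpirun when TRTLLM_DSV4_USE_MPIRUN=1,
+# otherwise trtllm-serve itself); it is passed to wait_for_server_ready and
+# run_benchmark_serving below via --server-pid.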
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai-chat \
+    --endpoint /v1/chat/completions \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$(( CONC * 10 ))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --trust-remote-code \
+    --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e25542834..4098a580a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2140,3 +2140,11 @@
     - "Search-space: tp=8 and tp=4/ep=1 over conc 4-64, on both 1024/1024 and 8192/1024 ISL/OSL"
     - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1057
+
+- config-keys:
+    - dsv4-fp4-b300-trt
+  description:
+    - "Add B300 TensorRT-LLM DeepSeek-V4-Pro eval coverage using the feat/deepseek_v4 image"
+    - "Disable TRTLLM fused MHC hyper-connection for eval servers via TRTLLM_MHC_ENABLE_FUSED_HC=0 because the current fused kernel corrupts DeepSeek-V4-Pro hidden size 7168 generations"
+    - "Keep this as eval-only PR validation until the TensorRT-LLM fused MHC kernel is guarded or supports hidden size 7168"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 5b4bac59d..c22a02df8 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -276,6 +276,7 @@ else
         LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
         BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
     fi
+
     LOCK_FILE="${SQUASH_FILE}.lock"
 
     # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
@@ -310,6 +311,7 @@ else
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID \
+        --mpi=none \
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
         --no-container-mount-home \
diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py
index af030720e..7f4a93284 100644
--- a/utils/bench_serving/backend_request_func.py
+++ b/utils/bench_serving/backend_request_func.py
@@ -341,8 +341,9 @@ async def async_request_openai_chat_completions(
 
     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
+        content = request_func_input.prompt
         if request_func_input.multi_modal_content:
+            content = [{"type": "text", "text": request_func_input.prompt}]
             content.append(request_func_input.multi_modal_content)
         payload = {
             "model": request_func_input.model_name \