diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4d5b582b2..ea2feec37 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2647,11 +2647,13 @@ dsv4-fp8-h200-vllm:
       - isl: 1024
         osl: 1024
         search-space:
-          - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
+          - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
       - isl: 8192
         osl: 1024
         search-space:
-          - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
+          - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
 
 # MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
@@ -2669,11 +2671,13 @@ dsv4-fp8-h200-vllm-mtp:
       - isl: 1024
         osl: 1024
         search-space:
-          - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+          - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - isl: 8192
         osl: 1024
         search-space:
-          - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+          - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+          - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
 # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
 # Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
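The new search-space rows only take effect once the harness turns each row into environment variables for the benchmark scripts patched below. A minimal sketch of that mapping, assuming the runner exports tp as TP, dp-attn as DP_ATTENTION, and ep as EP_SIZE (the ep-to-EP_SIZE name is an assumption; the patch leaves EP_SIZE optional and the scripts default it to 1):

# Hypothetical runner-side exports for { tp: 8, ep: 1, dp-attn: false, ... }:
export TP=8
export DP_ATTENTION=false   # selects the pure tensor-parallel branch
export EP_SIZE=1            # --enable-expert-parallel stays off

# Hypothetical runner-side exports for { tp: 8, ep: 8, dp-attn: true, ... }:
export TP=8
export DP_ATTENTION=true    # selects the DP-attention branch (DP size = TP)
export EP_SIZE=8            # turns on --enable-expert-parallel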
diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 167a50a57..ba709ac4b 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -9,6 +9,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -37,20 +38,29 @@ else
     MAX_MODEL_LEN_ARG="--max-model-len 800000"
 fi
 
+# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);
+# DP_ATTENTION=false runs pure tensor parallel.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
-# from the search space is used only for GPU allocation by the runner and
-# as the DP size.
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
     --trust-remote-code \
     --kv-cache-dtype fp8 \
     --block-size 256 \
     --no-enable-prefix-caching \
-    --enable-expert-parallel \
-    --data-parallel-size $TP \
+    "${PARALLEL_ARGS[@]}" \
+    "${EP_ARGS[@]}" \
     $MAX_MODEL_LEN_ARG \
     --gpu-memory-utilization 0.95 \
     --max-num-seqs 512 \
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
index ef0a0a8ad..b90b3d944 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -11,6 +11,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -45,20 +46,29 @@ else
     MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
 fi
 
+# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);
+# DP_ATTENTION=false runs pure tensor parallel.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
-# from the search space is used only for GPU allocation by the runner and
-# as the DP size.
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
     --trust-remote-code \
     --kv-cache-dtype fp8 \
     --block-size 256 \
     --no-enable-prefix-caching \
-    --enable-expert-parallel \
-    --data-parallel-size $TP \
+    "${PARALLEL_ARGS[@]}" \
+    "${EP_ARGS[@]}" \
     $MAX_MODEL_LEN_ARG \
     --gpu-memory-utilization 0.95 \
     --max-num-seqs 512 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9075a54a7..4334ecdcd 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2239,3 +2239,11 @@
   - "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness"
   - "Update the B200 DGXC Slurm partition from removed gpu to gpu-2 so single-node B200 jobs can allocate"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277
+
+- config-keys:
+  - dsv4-fp8-h200-vllm
+  - dsv4-fp8-h200-vllm-mtp
+  description:
+  - "Add pure TP (tp:8, ep:1) search-space row alongside the existing DP+EP row"
+  - "Raise conc-end from 64 to 256 on both 1k1k and 8k1k sweeps"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1287
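To sanity-check the new branch logic without a GPU box, here is a self-contained sketch (not part of the patch) that replays the PARALLEL_ARGS/EP_ARGS selection from both scripts for the two search-space rows and prints the flags each would hand to vllm serve:

#!/usr/bin/env bash
# Sketch only: mirrors the flag selection added to dsv4_fp8_h200.sh and
# dsv4_fp8_h200_mtp.sh; the row values come from nvidia-master.yaml above.
for row in "false 1" "true 8"; do
    read -r DP_ATTENTION EP_SIZE <<< "$row"
    TP=8

    # DP_ATTENTION=true swaps tensor parallel for data parallel of equal size.
    PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
    if [ "$DP_ATTENTION" = "true" ]; then
        PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
    fi

    # Expert parallel is only requested when the row carries ep > 1.
    EP_ARGS=()
    if [ "${EP_SIZE:-1}" -gt 1 ]; then
        EP_ARGS=(--enable-expert-parallel)
    fi

    echo "dp-attn=$DP_ATTENTION ep=$EP_SIZE -> ${PARALLEL_ARGS[*]} ${EP_ARGS[*]}"
done

The first row prints --tensor-parallel-size 8 --data-parallel-size 1 with no EP flag; the second prints --tensor-parallel-size 1 --data-parallel-size 8 --enable-expert-parallel, matching the removed hard-coded --enable-expert-parallel --data-parallel-size $TP path.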