From c3d0789735f740d838d9aebbe3349093d174b90b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 5 May 2026 13:39:02 -0700 Subject: [PATCH 1/3] dsv4-fp8-h200-vllm: switch from DP+EP to pure TP Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp8_h200.sh | 6 +----- benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index 167a50a57..2fc2aa9f8 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -40,17 +40,13 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP -# from the search space is used only for GPU allocation by the runner and -# as the DP size. set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ --kv-cache-dtype fp8 \ --block-size 256 \ --no-enable-prefix-caching \ ---enable-expert-parallel \ ---data-parallel-size $TP \ +--tensor-parallel-size $TP \ $MAX_MODEL_LEN_ARG \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index ef0a0a8ad..fcfadce16 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -48,17 +48,13 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP -# from the search space is used only for GPU allocation by the runner and -# as the DP size. set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ --kv-cache-dtype fp8 \ --block-size 256 \ --no-enable-prefix-caching \ ---enable-expert-parallel \ ---data-parallel-size $TP \ +--tensor-parallel-size $TP \ $MAX_MODEL_LEN_ARG \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ From 06bf7c4bfe54d244f406a5540d8cdf6a138bf9c3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 5 May 2026 14:29:23 -0700 Subject: [PATCH 2/3] dsv4-fp8-h200-vllm: parameterize TP vs DP+EP, sweep both up to conc 256 Re-introduces the DP-attention + expert-parallel path on top of pure TP by reading DP_ATTENTION/EP_SIZE env vars (matching dsv4_fp4_b200_vllm.sh). nvidia-master.yaml now sweeps both {tp:8,ep:1,dp-attn:false} and {tp:8,ep:8,dp-attn:true}, with conc-end raised from 64 to 256. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 12 ++++++++---- benchmarks/single_node/dsv4_fp8_h200.sh | 16 +++++++++++++++- benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 16 +++++++++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 042d9a5f8..20e222a8b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2624,11 +2624,13 @@ dsv4-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 } # MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds @@ -2646,11 +2648,13 @@ dsv4-fp8-h200-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp } # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). # Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index 2fc2aa9f8..ba709ac4b 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -9,6 +9,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -37,6 +38,18 @@ else MAX_MODEL_LEN_ARG="--max-model-len 800000" fi +# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP); +# DP_ATTENTION=false runs pure tensor parallel. +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi + +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -46,7 +59,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --kv-cache-dtype fp8 \ --block-size 256 \ --no-enable-prefix-caching \ ---tensor-parallel-size $TP \ +"${PARALLEL_ARGS[@]}" \ +"${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index fcfadce16..b90b3d944 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -11,6 +11,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -45,6 +46,18 @@ else MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN" fi +# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP); +# DP_ATTENTION=false runs pure tensor parallel. +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi + +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -54,7 +67,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --kv-cache-dtype fp8 \ --block-size 256 \ --no-enable-prefix-caching \ ---tensor-parallel-size $TP \ +"${PARALLEL_ARGS[@]}" \ +"${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ From 701ea13d6bb9f7b7c23f32464d687ed71cc1fcc8 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 5 May 2026 16:26:57 -0700 Subject: [PATCH 3/3] update --- .github/configs/nvidia-master.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 20e222a8b..81e984fe1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2624,13 +2624,13 @@ dsv4-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 } # MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds @@ -2648,13 +2648,13 @@ dsv4-fp8-h200-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp } # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). # Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper