From c3d0789735f740d838d9aebbe3349093d174b90b Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Tue, 5 May 2026 13:39:02 -0700
Subject: [PATCH 1/3] dsv4-fp8-h200-vllm: switch from DP+EP to pure TP

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_h200.sh     | 6 +-----
 benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 167a50a57..2fc2aa9f8 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -40,17 +40,13 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
-# from the search space is used only for GPU allocation by the runner and
-# as the DP size.
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
 --kv-cache-dtype fp8 \
 --block-size 256 \
 --no-enable-prefix-caching \
---enable-expert-parallel \
---data-parallel-size $TP \
+--tensor-parallel-size $TP \
 $MAX_MODEL_LEN_ARG \
 --gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
index ef0a0a8ad..fcfadce16 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -48,17 +48,13 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
-# from the search space is used only for GPU allocation by the runner and
-# as the DP size.
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
 --kv-cache-dtype fp8 \
 --block-size 256 \
 --no-enable-prefix-caching \
---enable-expert-parallel \
---data-parallel-size $TP \
+--tensor-parallel-size $TP \
 $MAX_MODEL_LEN_ARG \
 --gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \

From 06bf7c4bfe54d244f406a5540d8cdf6a138bf9c3 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Tue, 5 May 2026 14:29:23 -0700
Subject: [PATCH 2/3] dsv4-fp8-h200-vllm: parameterize TP vs DP+EP, sweep both
 up to conc 256

Re-introduces the DP-attention + expert-parallel path on top of pure TP
by reading DP_ATTENTION/EP_SIZE env vars (matching dsv4_fp4_b200_vllm.sh).
nvidia-master.yaml now sweeps both {tp:8,ep:1,dp-attn:false} and
{tp:8,ep:8,dp-attn:true}, with conc-end raised from 64 to 256.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml          | 12 ++++++++----
 benchmarks/single_node/dsv4_fp8_h200.sh     | 16 +++++++++++++++-
 benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 16 +++++++++++++++-
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 042d9a5f8..20e222a8b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2624,11 +2624,13 @@ dsv4-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 }
 
 # MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
@@ -2646,11 +2648,13 @@ dsv4-fp8-h200-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
 # Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 2fc2aa9f8..ba709ac4b 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -9,6 +9,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -37,6 +38,18 @@ else
     MAX_MODEL_LEN_ARG="--max-model-len 800000"
 fi
 
+# DP_ATTENTION=true runs DP-attention (TP=1, DP size = $TP); any other value
+# runs pure tensor parallel. Expert parallel is enabled only if EP_SIZE > 1.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -46,7 +59,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --kv-cache-dtype fp8 \
 --block-size 256 \
 --no-enable-prefix-caching \
---tensor-parallel-size $TP \
+"${PARALLEL_ARGS[@]}" \
+"${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
 --gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
index fcfadce16..b90b3d944 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -11,6 +11,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -45,6 +46,18 @@ else
     MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
 fi
 
+# DP_ATTENTION=true runs DP-attention (TP=1, DP size = $TP); any other value
+# runs pure tensor parallel. Expert parallel is enabled only if EP_SIZE > 1.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -54,7 +67,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --kv-cache-dtype fp8 \
 --block-size 256 \
 --no-enable-prefix-caching \
---tensor-parallel-size $TP \
+"${PARALLEL_ARGS[@]}" \
+"${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
 --gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \

From 701ea13d6bb9f7b7c23f32464d687ed71cc1fcc8 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Tue, 5 May 2026 16:26:57 -0700
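Subject: [PATCH 3/3] dsv4-fp8-h200-vllm: lower sweep conc-start from 4 to 1

Start the concurrency sweep at 1 instead of 4 for every search-space
entry (pure TP and DP-attention + EP) of dsv4-fp8-h200-vllm and
dsv4-fp8-h200-vllm-mtp, at both isl 1024 and isl 8192. conc-end stays
at 256.
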
---
 .github/configs/nvidia-master.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 20e222a8b..81e984fe1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2624,13 +2624,13 @@ dsv4-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
 
 # MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
@@ -2648,13 +2648,13 @@ dsv4-fp8-h200-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, dp-attn: false, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
 # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
 # Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper