Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2647,11 +2647,13 @@ dsv4-fp8-h200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }

# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
Expand All @@ -2669,11 +2671,13 @@ dsv4-fp8-h200-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }

# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
Expand Down
20 changes: 15 additions & 5 deletions benchmarks/single_node/dsv4_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand Down Expand Up @@ -37,20 +38,29 @@ else
MAX_MODEL_LEN_ARG="--max-model-len 800000"
fi

# Choose the parallelism layout from DP_ATTENTION:
#   "true"        -> DP-attention: data-parallel size = TP, tensor-parallel size = 1
#   anything else -> pure tensor parallel across TP GPUs
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
else
  PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
fi

# Expert parallelism is opt-in: pass the flag only when EP_SIZE (default 1) exceeds 1.
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  EP_ARGS=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
# from the search space is used only for GPU allocation by the runner and
# as the DP size.
set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
--enable-expert-parallel \
--data-parallel-size $TP \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
Expand Down
20 changes: 15 additions & 5 deletions benchmarks/single_node/dsv4_fp8_h200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand Down Expand Up @@ -45,20 +46,29 @@ else
MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
fi

# Choose the parallelism layout from DP_ATTENTION:
#   "true"        -> DP-attention: data-parallel size = TP, tensor-parallel size = 1
#   anything else -> pure tensor parallel across TP GPUs
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
else
  PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
fi

# Expert parallelism is opt-in: pass the flag only when EP_SIZE (default 1) exceeds 1.
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  EP_ARGS=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
# from the search space is used only for GPU allocation by the runner and
# as the DP size.
set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
--enable-expert-parallel \
--data-parallel-size $TP \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2239,3 +2239,11 @@
- "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness"
- "Update the B200 DGXC Slurm partition from removed gpu to gpu-2 so single-node B200 jobs can allocate"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277

- config-keys:
- dsv4-fp8-h200-vllm
- dsv4-fp8-h200-vllm-mtp
description:
- "Add pure TP (tp:8, ep:1) search-space row alongside the existing DP+EP row"
- "Raise conc-end from 64 to 256 on both 1k1k and 8k1k sweeps"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1287
Loading