SemiAnalysisAI · Oseltamivir · Apr 26, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
@@ -2477,17 +2477,15 @@
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 4 }
     - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 8, conc-start: 128, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
+    - { tp: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 4 }
-    - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 8, conc-start: 128, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
+    - { tp: 8, dp-attn: true, conc-start: 1024, conc-end: 8192 }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -38,6 +38,13 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
 fi
 
+# max-num-batched-tokens (mbt): ISL when DP attention is enabled, 2*ISL otherwise (TP mode); never below 2048
+if [ "${DP_ATTENTION}" = "true" ]; then
+    MAX_NUM_BATCHED_TOKENS=$(( ISL < 2048 ? 2048 : ISL ))
+else
+    MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 < 2048 ? 2048 : ISL * 2 ))
+fi
+
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
     BENCHMARK_MAX_MODEL_LEN=4096
@@ -71,7 +78,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --reasoning-parser deepseek_v4 \
     --max-cudagraph-capture-size 2048 \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
-    --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 &
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -1819,3 +1819,12 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - "Update search space based on B300 Pareto sweep results"
+    - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
+    - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
+