Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2647,11 +2647,13 @@ dsv4-fp8-h200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256 }

# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
Expand All @@ -2669,11 +2671,13 @@ dsv4-fp8-h200-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }

# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
Expand Down
20 changes: 15 additions & 5 deletions benchmarks/single_node/dsv4_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand Down Expand Up @@ -37,20 +38,29 @@ else
MAX_MODEL_LEN_ARG="--max-model-len 800000"
fi

# Choose the parallelism layout from DP_ATTENTION:
#   "true"        -> DP-attention: data-parallel size = TP, tensor-parallel size = 1
#   anything else -> pure tensor parallel across TP GPUs
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
else
  PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
fi

# Expert parallelism is opt-in: pass the flag only when EP_SIZE (default 1) exceeds 1.
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  EP_ARGS=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
# from the search space is used only for GPU allocation by the runner and
# as the DP size.
set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
--enable-expert-parallel \
--data-parallel-size $TP \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
Expand Down
20 changes: 15 additions & 5 deletions benchmarks/single_node/dsv4_fp8_h200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand Down Expand Up @@ -45,20 +46,29 @@ else
MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
fi

# Choose the parallelism layout from DP_ATTENTION:
#   "true"        -> DP-attention: data-parallel size = TP, tensor-parallel size = 1
#   anything else -> pure tensor parallel across TP GPUs
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
else
  PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
fi

# Expert parallelism is opt-in: pass the flag only when EP_SIZE (default 1) exceeds 1.
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  EP_ARGS=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
# from the search space is used only for GPU allocation by the runner and
# as the DP size.
set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
--enable-expert-parallel \
--data-parallel-size $TP \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2239,3 +2239,11 @@
- "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness"
- "Update the B200 DGXC Slurm partition from removed gpu to gpu-2 so single-node B200 jobs can allocate"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277

- config-keys:
- dsv4-fp8-h200-vllm
- dsv4-fp8-h200-vllm-mtp
description:
- "Add pure TP (tp:8, ep:1) search-space row alongside the existing DP+EP row"
- "Raise conc-end from 64 to 256 on both 1k1k and 8k1k sweeps"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1287
Loading