Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
26e540d
feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark
cquil11 Apr 24, 2026
efdc8ba
fix: switch dsv4-fp4-b300-sglang to Pro + Max-Throughput recipe
cquil11 Apr 24, 2026
cc35a12
chore: sync launch_b200-dgxc-slurm.sh cache mount from claude/add-dsv…
cquil11 Apr 24, 2026
404a097
fix: restore trailing whitespace stripped from glm5.1 changelog entry
cquil11 Apr 24, 2026
97a488e
chore: add flock-guarded squash import to B300 runner
cquil11 Apr 24, 2026
106deea
fix: drop ENROOT_CACHE_PATH override from B300 runner
cquil11 Apr 24, 2026
4bb1f1a
chore: point B300 runner at shared gharunners/{squash,hf-hub-cache}
cquil11 Apr 24, 2026
744c5a0
fix: move enroot import out of srun to avoid pyxis namespace collision
cquil11 Apr 24, 2026
d003c59
fix: wipe stale pyxis scratch dirs for this JOB_ID before benchmark srun
cquil11 Apr 24, 2026
f00629f
Revert: drop all B300 runner changes, mirror #1128's approach
cquil11 Apr 24, 2026
570b0eb
runner: add head-node flock-guarded squash import on B300
cquil11 Apr 24, 2026
864419d
fix: mount at /ix and clear baked-in CUDA_VISIBLE_DEVICES
cquil11 Apr 24, 2026
5d93913
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 24, 2026
9453676
runner: use /data/models pre-staged path for dsv4 on B300
cquil11 Apr 24, 2026
5db43b8
fix: switch B300 dsv4 sglang to bw-ultra-compiled image
cquil11 Apr 24, 2026
c060c58
fix: switch B300 dsv4 sglang image to yhyang201/sglang-b300:v3
cquil11 Apr 24, 2026
08edf26
update b300
cquil11 Apr 24, 2026
a699ca0
feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split search-space
cquil11 Apr 24, 2026
d35696c
update b300
cquil11 Apr 24, 2026
c3b562c
feat(dsv4-fp4-b300-sglang): low-latency recipe at every CONC (fallback)
cquil11 Apr 24, 2026
410df74
fix: align perf-changelog and config comments with low-latency fallback
github-actions[bot] Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1796,6 +1796,36 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. Until a B300-specific
# recipe ships, this config runs the low-latency recipe (TP=8, EP=1, no
# DP-attention, no DeepEP, no MTP) on B300. Prefix caching is disabled.
# Concurrency ranges: 4-1024 for 1k1k and 4-512 for 8k1k.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
model: Qwen/Qwen3.5-397B-A17B
Expand Down
103 changes: 103 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#
# Single-node DeepSeek-V4 FP4 benchmark on B300 using SGLang.
#
# Launches `sglang serve` in the background, waits until it is ready, runs the
# serving benchmark against it and, when RUN_EVAL=true, an lm-eval pass.
#
# Required env vars (enforced by check_env_vars below):
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME
# Optional env vars:
#   PORT       port the server listens on (default: 8888)
#   EVAL_ONLY  "true" -> call setup_eval_context and pass --context-length
#   RUN_EVAL   "true" -> run lm-eval after the benchmark

# The helpers called below (check_env_vars, wait_for_server_ready, ...) are not
# defined in this script; they are expected to come from benchmark_lib.sh.
# Abort if it cannot be loaded: otherwise every helper call would fail with
# "command not found" while the server is still launched and the script could
# finish with a zero exit status.
source "$(dirname "$0")/../benchmark_lib.sh" || {
    echo "Failed to source benchmark_lib.sh" >&2
    exit 1
}

check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
# `hf download`. Only fetch when MODEL looks like a HF repo ID.
if [[ "$MODEL" != /* ]]; then
    hf download "$MODEL"
fi

nvidia-smi

export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
unset CUDA_VISIBLE_DEVICES

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Extra server flags for eval-only runs. Kept as a plain string (empty or a
# single "--flag value" pair) and word-split on purpose at the launch site.
EVAL_CONTEXT_ARGS=""
if [[ "${EVAL_ONLY}" == "true" ]]; then
    setup_eval_context
    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
    --moe-runner-backend flashinfer_mxfp4
    --chunked-prefill-size 4096
    --disable-flashinfer-autotune
    --mem-fraction-static 0.82
)
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
# Every expansion is quoted except EVAL_CONTEXT_ARGS, which must split into
# separate words (or vanish entirely when empty).
# shellcheck disable=SC2086
PYTHONNOUSERSITE=1 sglang serve \
    --model-path "$MODEL" \
    --host 0.0.0.0 \
    --port "$PORT" \
    --trust-remote-code \
    --tp "$TP" \
    --disable-radix-cache \
    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

# Ten prompts are issued per unit of concurrency.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir "$PWD/"

if [[ "${RUN_EVAL}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1745,3 +1745,14 @@
- "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
- "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)"
- "Image: lmsysorg/sglang:deepseek-v4-b300"
- "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)"
- "Low-latency recipe only (TP=8, EP=1, no DP-attn, no DeepEP) while the DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300"
- "Prefix caching disabled, no speculative decoding"
- "Concurrency ranges: 4-1024 for 1k1k, 4-512 for 8k1k"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The perf-changelog entry added at lines 1749-1758 (and the outer NOTE comment at nvidia-master.yaml:1799-1803) describes the max-throughput config from the sibling PR #1132, not the low-latency-only fallback this PR actually adds: every field is wrong — image is deepseek-v4-blackwell vs the actual deepseek-v4-b300, it claims DP=8 + DeepEP and TP=8/EP=8/dp-attn=true with concurrency 4-1024/4-512 vs the actual tp:8, ep:1 with conc 4-32 and no DP-attn/DeepEP, and it references a nonexistent config dsv4-fp4-b200-vllm with pr-link pointing at #1132. Both blocks look like they were inherited from the parent branch and should be rewritten to describe the low-latency-only fallback (or the changelog entry deferred until #1132 lands); the in-block comment at nvidia-master.yaml:1812-1817 already has the correct description and directly contradicts the stale outer NOTE.

Extended reasoning...

What the bug is

This PR adds a new dsv4-fp4-b300-sglang config to .github/configs/nvidia-master.yaml and a matching entry to perf-changelog.yaml. The PR title and description make clear that it is a low-latency-only fallback — it strips the balanced and max-throughput rows because --moe-a2a-backend deepep is broken on this image/checkpoint. But the new changelog entry (lines 1749-1758) and the outer NOTE comment in the yaml (lines 1799-1803) both describe the opposite: the balanced/max-throughput recipe that the sibling PR #1132 will add once DeepEP is fixed.

Field-by-field comparison

Changelog entry at perf-changelog.yaml:1749-1758 vs the actual yaml that this PR adds:

| Field | Changelog says | Actual yaml says |
| --- | --- | --- |
| Image | lmsysorg/sglang:deepseek-v4-blackwell | lmsysorg/sglang:deepseek-v4-b300 (line 1805) |
| Recipe | "B200 Pro Max-Throughput recipe" with "DP=8 + DeepEP" | "Low-latency only (TP-only, no DP-attn, no DeepEP)" (line 1812) |
| Parallelism | TP=8/EP=8/dp-attn=true | { tp: 8, ep: 1 }, no dp-attn (lines 1821-1826) |
| Concurrency | 4-1024 (1k1k), 4-512 (8k1k) | conc-start: 4, conc-end: 32 for both |
| Mirror reference | dsv4-fp4-b200-vllm | No such key exists in nvidia-master.yaml (only dsv4-fp8-h200-vllm) |
| pr-link | pull/1132 | This PR is #1143; #1132 is explicitly called the follow-up in the description |

Step-by-step proof

  1. Read the diff for .github/configs/nvidia-master.yaml. The new config has exactly one search-space tuple per ISL: { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }. No dp-attention, no --moe-a2a-backend deepep, no EP>1.
  2. The in-yaml comment at lines 1812-1817 explicitly reads: "Low-latency only (TP-only, no DP-attn, no DeepEP) while the DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300."
  3. Now read the outer NOTE at lines 1799-1803 of the same diff: "B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300... Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm." These two comment blocks describe mutually exclusive recipes for the same config entry.
  4. Read the PR description: "Strips the balanced and max-throughput rows from the dsv4-fp4-b300-sglang search-space so only the low-latency (TP-only) recipe runs" and "Re-introduce the balanced and max-throughput rows on [NVIDIA] chore: B300 single node DeepSeek v4 SGLang #1132 once the FP8+DeepEP weight-postprocess issue is fixed upstream." The outer NOTE and the changelog entry describe the [NVIDIA] chore: B300 single node DeepSeek v4 SGLang #1132 version, not this PR.
  5. A grep over .github/configs/nvidia-master.yaml for dsv4-fp4-b200-vllm returns only the stale comment — the referenced config does not exist.
  6. The changelog entry's pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 points at the wrong PR: this is [NVIDIA] chore: B300 single node DeepSeek v4 SGLang LOW LATENCY ONLY #1143, and per the description [NVIDIA] chore: B300 single node DeepSeek v4 SGLang #1132 is the follow-up.

Why existing code doesn't prevent it

perf-changelog.yaml and the yaml comments are free-form documentation — nothing validates that the changelog description matches the config it's documenting. The entry appears to have been inherited from the parent branch (chore/dsv4-sgl-b300#1132) and the search-space was rewritten for the fallback without updating the surrounding comments or the changelog.

Impact

After this PR merges, perf-changelog.yaml — the authoritative record of what changed in each PR — will state that PR #1132 added a DP=8/DeepEP/EP=8 max-throughput recipe with concurrency up to 1024. In reality, #1132 has not merged, and PR #1143 added a TP=8/EP=1 low-latency recipe capped at concurrency 32. Future readers diffing historical performance numbers against the changelog will be materially misled about the recipe in effect. Inside the yaml, the two contradictory comment blocks make it hard to tell which one reflects the actual intent.

How to fix

Rewrite both blocks to describe what this PR actually does. Minimal fix:

  • Update the perf-changelog.yaml entry to: image lmsysorg/sglang:deepseek-v4-b300, "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — fallback while the FP8 + DeepEP weight-postprocess path is broken on this checkpoint", concurrency 4-32 for both ISLs, drop the dsv4-fp4-b200-vllm reference, and set pr-link to pull/1143.
  • Either delete the outer NOTE comment at nvidia-master.yaml:1799-1803 (the in-block comment at 1812-1817 already explains the situation correctly) or rewrite it to match.
  • Alternatively, defer the changelog entry and add it in [NVIDIA] chore: B300 single node DeepSeek v4 SGLang #1132 when the full max-throughput recipe actually lands.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@claude please fix this

@Klaud-Cold Klaud-Cold Apr 24, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude finished @cquil11's task in 1m 28s —— View job


Fixing stale perf-changelog and config comments

  • Read current perf-changelog.yaml and nvidia-master.yaml to identify exact content
  • Fix perf-changelog.yaml entry to describe the low-latency fallback (correct image, parallelism, concurrency, pr-link)
  • Fix stale NOTE comment in nvidia-master.yaml
  • Commit and push changes

Changes (410df74)

perf-changelog.yaml — rewrote the dsv4-fp4-b300-sglang entry:

  • Image: deepseek-v4-blackwell → deepseek-v4-b300
  • Description: max-throughput DP=8/DeepEP/EP=8 → low-latency TP=8/EP=1, no DP-attn, no DeepEP
  • Concurrency: updated to match actual config (1k1k 4-1024, 8k1k 4-512)
  • Removed stale dsv4-fp4-b200-vllm mirror reference
  • pr-link: #1132 → #1143

.github/configs/nvidia-master.yaml — replaced the outer NOTE (lines 1799-1803) that described the B200 max-throughput recipe with a note matching the actual low-latency fallback. The in-block TODO comment at line 1812 was already correct and is unchanged.

5 changes: 2 additions & 3 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ EOF

else

HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
Expand All @@ -276,7 +275,7 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL,PORT=8888 \
Expand Down
48 changes: 38 additions & 10 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -248,29 +248,57 @@ find . -name '.nfs*' -delete 2>/dev/null || true

else

HF_HUB_CACHE_MOUNT="/scratch/models"
# Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
# so point MODEL at the local copy. Other models fall through and use `hf download`
# against the mounted cache from their benchmark script.
# Pre-staged models on the B300 cluster live under /data/models. Point MODEL
# at the local copy so the benchmark skips `hf download` and reads from the
# mounted dir. Other models fall through and use `hf download` from their
# benchmark script.
HF_HUB_CACHE_MOUNT="/data/models"
if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
export MODEL="/scratch/models/${MODEL#*/}"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
fi
SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
# and its B300-recompiled forks like yhyang201/sglang-b300) install sglang
# editable at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
# so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
# and breaks `import sglang`. Mount these images at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *deepseek-v4-b300* || "$IMAGE" == *sglang-b300* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

# Import the squash file on the head node (outside any srun) under flock.
# Parallel GH jobs target the same shared squash path; flock serializes
# imports so only one job pulls and writes the file while the rest wait.
# fd 9 is opened inside the subshell, so the lock is dropped when it exits.
# `exit` inside ( ... ) only terminates the subshell, so its status is checked
# explicitly below: without that check a lock timeout or a failed import would
# fall through to salloc/srun with a missing or partial squash file.
(
    exec 9>"$LOCK_FILE" || exit 1
    flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
    if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
        echo "Squash file already exists and is valid, skipping import"
    else
        # Remove any stale or corrupt file first so the import writes a fresh one.
        rm -f "$SQUASH_FILE"
        enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
    fi
) || { echo "Squash import failed for $IMAGE ($SQUASH_FILE)" >&2; exit 1; }

# Pin to one of the known-good B300 nodes; others have hardware/network
# issues that cause benchmarks to hang or fail to start.
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--no-container-mount-home \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh

Expand Down
Loading