Skip to content
228 changes: 165 additions & 63 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2862,9 +2862,9 @@ dsr1-fp8-gb200-dynamo-trt:


dsr1-fp8-gb200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.5.post2
image: lmsysorg/sglang:v0.5.8-cu130
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
model-prefix: dsr1-fp8
Comment thread
jgangani marked this conversation as resolved.
Outdated
runner: gb200
precision: fp8
framework: dynamo-sglang
Expand All @@ -2874,114 +2874,216 @@ dsr1-fp8-gb200-dynamo-sglang:
- isl: 1024
osl: 1024
search-space:
# "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 4096 ]
# "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
- conc-list: [4, 8]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48)
- conc-list: [1024, 2048, 4096]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml"
decode:
num-worker: 1
tp: 48
ep: 48
dp-attn: true

# "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [1024, 2048, 4096, 6144]
prefill:
num-worker: 2
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml
- "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml"
decode:
num-worker: 1
tp: 1
tp: 32
ep: 32
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8)
- conc-list: [4, 8, 16]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=8"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false

# "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4)
- spec-decoding: "none"
conc-list: [ 2, 4, 8, 16, 64, 128 ]
# "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [512, 1024, 2048, 6144]
prefill:
num-worker: 5
tp: 8
ep: 8
dp-attn: true
additional-settings:
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml"
decode:
num-worker: 1
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
ep: 4
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
- conc-list: [2048, 4096, 6144]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-low-latency"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml
- "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml"
decode:
num-worker: 4
tp: 1
ep: 4
num-worker: 1
tp: 24
ep: 24
dp-attn: true

dsr1-fp8-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8-cu130
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1-fp8
runner: gb300
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4)
- conc-list: [4, 8, 16, 32]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=4"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml"
decode:
num-worker: 4
tp: 4
ep: 1
dp-attn: false

# "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48)
- spec-decoding: "none"
conc-list: [ 1024, 2048, 4096 ]
# "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [1024, 2048, 4096, 6144]
prefill:
num-worker: 3
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml"
decode:
num-worker: 1
tp: 1
ep: 48
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
- conc-list: [4096, 7168, 7680]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=12"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml
- "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
- spec-decoding: "none"
conc-list: [ 4, 8, 16, 32 ]
# "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
- conc-list: [4, 8]
prefill:
num-worker: 1
tp: 1
ep: 4
dp-attn: true
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml"
decode:
num-worker: 1
tp: 1
ep: 4
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
tp: 4
ep: 1
dp-attn: false

# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 6144 ]
# "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
- conc-list: [128, 256, 512, 1024]
prefill:
num-worker: 5
tp: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml"
decode:
num-worker: 1
tp: 1
tp: 32
ep: 32
dp-attn: true

# "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
- conc-list: [2048, 4096]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml
- "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml"
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true

dsr1-fp4-gb200-dynamo-sglang:
image: "lmsysorg/sglang:v0.5.5.post2"
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -361,3 +361,13 @@
- "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads"
- "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/617

- config-keys:
- dsr1-fp8-gb200-dynamo-sglang
- dsr1-fp8-gb300-dynamo-sglang
description:
- "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode"
- "Image: lmsysorg/sglang:v0.5.8-cu130"
- "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635

46 changes: 2 additions & 44 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
if [[ $MODEL_PREFIX == "dsr1" ]]; then
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
elif [[ $MODEL_PREFIX == "dsr1-fp8" ]]; then
export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/"
Comment thread
jgangani marked this conversation as resolved.
Outdated
else
export MODEL_PATH=$MODEL
fi
Expand Down Expand Up @@ -48,50 +50,6 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX
export ISL="$ISL"
export OSL="$OSL"

# Legacy dynamo-sglang path.
#
# Runs the per-experiment benchmark script, blocks until squeue reports no
# remaining jobs for $USER, then finds the most recently modified
# logs/<name>/vllm_isl_<ISL>_osl_<OSL> directory under SGL_SLURM_JOBS_PATH and
# copies every file in it to $GITHUB_WORKSPACE, prefixed with RESULT_FILENAME.
#
# Reads:   FRAMEWORK, SQUASH_FILE, EXP_NAME, PRECISION, ISL, OSL, USER,
#          GITHUB_WORKSPACE, RESULT_FILENAME
# Exports: IMAGE, SGL_SLURM_JOBS_PATH
# Exits:   0 after copying results, 1 when no matching logs directory exists.
if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
  export IMAGE="$SQUASH_FILE"
  export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
  bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}.sh"

  # Wait for all jobs to complete.
  # NOTE: this polls every job owned by $USER, not only the ones submitted by
  # the benchmark script above.
  echo "Waiting for all jobs to complete..."
  while [[ -n "$(squeue -u "$USER" --noheader --format='%i')" ]]; do
    echo "Jobs still running..."
    squeue --steps -u "$USER"
    sleep 30
  done

  # Find the latest log directory that contains the data.
  # The heredoc body must stay at column 0: its delimiter is quoted (no shell
  # expansion) and leading whitespace would be passed through to Python.
  cat > collect_latest_results.py <<'PY'
import os, sys
sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
    print(path)
PY

  LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" "$ISL" "$OSL" 1)
  if [[ -z "$LOGS_DIR" ]]; then
    echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
    exit 1
  fi

  echo "Found logs directory: $LOGS_DIR"
  ls -la -- "$LOGS_DIR"

  # Result JSON are contained within the result directory.
  # Iterate NUL-delimited so paths containing whitespace or glob characters
  # are handled as single entries.
  while IFS= read -r -d '' result_file; do
    # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json
    file_name=$(basename -- "$result_file")
    if [[ -f "$result_file" ]]; then
      # Copy the result file to workspace with a unique name
      WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
      echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}"
      cp -- "$result_file" "$WORKSPACE_RESULT_FILE"
    fi
  done < <(find "$LOGS_DIR" -type f -print0)

  exit 0
fi

echo "Cloning srt-slurm repository..."
SRT_REPO_DIR="srt-slurm"
if [ -d "$SRT_REPO_DIR" ]; then
Expand Down
Loading