Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a1a0325
bring all configs here
Elnifio Dec 9, 2025
c03076b
test for GB200 only
Elnifio Dec 9, 2025
028f224
updates the files and git clone urls
Elnifio Dec 9, 2025
25a19b1
update the prefill nodes
Elnifio Dec 10, 2025
124ddf4
update 1k1k fp4 config
Elnifio Dec 10, 2025
6199031
updates to run 1k1k fp4 only
Elnifio Dec 10, 2025
344ac6c
updates the FP4 8k1k
Elnifio Dec 10, 2025
355773a
update the model path
Elnifio Dec 10, 2025
0dd1e5a
restore changes to full sweeps
Elnifio Dec 10, 2025
7da0be5
updates the config for 1k1k fp4
Elnifio Dec 11, 2025
b38b633
temporarily disable some concurrencies
Elnifio Dec 11, 2025
8136816
updates the params
Elnifio Dec 12, 2025
c1f1be4
updates the branch
Elnifio Dec 12, 2025
7a8e890
update config
Elnifio Dec 15, 2025
ce40018
temporarily disable all other configs
Elnifio Dec 15, 2025
35c7eb3
Revert "temporarily disable all other configs"
Elnifio Dec 16, 2025
b26d699
update comments
Elnifio Dec 16, 2025
5b0509a
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 17, 2025
c1024db
bump the image for DSR1
Elnifio Dec 17, 2025
3d4c3ae
Merge branch 'main' into ishan/moreconfigs
yunzhoul-nv Dec 17, 2025
35d7555
update the model-path args
Elnifio Dec 17, 2025
45cc883
model-path not permitted
Elnifio Dec 17, 2025
a6cc157
switches the branch
Elnifio Dec 17, 2025
2731ccb
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 17, 2025
b3ccea8
add perf changelog
cquil11 Dec 17, 2025
00dcff7
used the wrong model path here...
Elnifio Dec 18, 2025
e845bdd
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 178 additions & 6 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
Expand All @@ -819,7 +820,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1p_4d"
- "SCRIPT_MODE=1k1k-low-latency"
decode:
num-worker: 4
tp: 1
Expand All @@ -841,6 +842,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
Expand All @@ -852,22 +854,192 @@ dsr1-fp8-gb200-dynamo-sglang:
- isl: 8192
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
- spec-decoding: "none"
conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=1"

# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 6144 ]
prefill:
num-worker: 5
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"

dsr1-fp4-gb200-dynamo-sglang:
# TODO: swap
image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
# TODO: what is the right name?
model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
model-prefix: dsr1
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
- spec-decoding: "none"
conc-list: [ 4, 8, 32, 64 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-low-latency"
decode:
num-worker: 2
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=2"

# Mid curve (4 prefill workers each at DEP4 and 1 decode worker at DEP48)
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 4096, 8192 ]
prefill:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-middle-curve"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=12"

# Top of curve (4 prefill workers each at DEP4 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 8192, 12000, 15000 ]
prefill:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
- isl: 8192
osl: 1024
search-space:
- spec-decoding: "none"
conc-list: [ 4, 8, 32, 64 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
decode:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=4"
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 4096 ]
prefill:
num-worker: 6
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=8k1k-middle-curve"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=12"
- "DECODE_NODES=12"
- spec-decoding: "none"
conc-list: [ 1024, 2048, ]
prefill:
num-worker: 10
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=6"
- "DECODE_NODES=8"
- spec-decoding: "none"
conc-list: [ 8192 ]
prefill:
num-worker: 10
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
38 changes: 38 additions & 0 deletions benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

#!/bin/bash
#
# Launches the DeepSeek-R1 FP4 disaggregated Dynamo + SGLang benchmark on GB200
# by forwarding the sweep parameters to submit_disagg.sh from the Dynamo repo.
#
# Environment passed to check_env_vars (provided by benchmark_lib.sh):
#   CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH
#   PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN
#   DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN
#   PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH
# Optional environment:
#   SCRIPT_MODE  forwarded as the last submit_disagg.sh argument when non-empty
#   CONFIG_DIR   re-exported unchanged
#
# Exit status: 1 if the Dynamo checkout or the working directory cannot be
# prepared; otherwise the exit status of submit_disagg.sh.

set -x

source "$(dirname "$0")/benchmark_lib.sh"

check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
  PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
  DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
  PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH

# Always clone and setup Dynamo
# All configs are now tracked in this branch.
dynamo_repo="https://github.com/ai-dynamo/dynamo.git"
dynamo_branch="ishan/fp48k1k"

echo "Cloning Dynamo repository..."
# An existing checkout (e.g. from an earlier run in the same workspace) is
# reused, as before; a clone that fails for any other reason is now fatal
# instead of letting the launch continue without the repository.
if [[ ! -d dynamo/.git ]]; then
  git clone "$dynamo_repo" || {
    echo "ERROR: failed to clone $dynamo_repo" >&2
    exit 1
  }
fi

# git -C avoids the 'cd dynamo && ... && cd ..' chain, which left the shell
# inside dynamo/ whenever the checkout failed.
git -C dynamo checkout "$dynamo_branch" || {
  echo "ERROR: failed to check out $dynamo_branch" >&2
  exit 1
}

cd "$SGL_SLURM_JOBS_PATH" || {
  echo "ERROR: cannot cd to $SGL_SLURM_JOBS_PATH" >&2
  exit 1
}

# Set up SGL launch script-specific environment variables
export TIME_LIMIT="04:00:00"
export MODEL_PATH="$MODEL_PATH"
export CONFIG_DIR="$CONFIG_DIR"
export CONTAINER_IMAGE="$IMAGE"
export GPU_TYPE="gb200-fp4"

# Launch jobs based on ISL/OSL
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
# expects the concurrencies.
submit_args=(
  "$PREFILL_NODES"
  "$PREFILL_NUM_WORKERS"
  "$DECODE_NODES"
  "$DECODE_NUM_WORKERS"
  "$N_ADDITIONAL_FRONTENDS"
  "$ISL" "$OSL" "${CONC_LIST// /x}" inf
  "$GPU_TYPE"
)

# SCRIPT_MODE is not validated above, so only append it when it is set; an
# empty quoted argument would otherwise be passed where none was before.
if [[ -n "${SCRIPT_MODE:-}" ]]; then
  submit_args+=("$SCRIPT_MODE")
fi

bash ./submit_disagg.sh "${submit_args[@]}"
11 changes: 5 additions & 6 deletions benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \

# Always clone and setup Dynamo
echo "Cloning Dynamo repository..."
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git
else
git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git
fi
git clone https://github.com/ai-dynamo/dynamo.git
cd dynamo && git checkout ishan/fp48k1k && cd .. # All configs are now tracked in this branch

cd "$SGL_SLURM_JOBS_PATH"

Expand All @@ -25,6 +22,7 @@ export TIME_LIMIT="04:00:00"
export MODEL_PATH=$MODEL_PATH
export CONFIG_DIR=$CONFIG_DIR
export CONTAINER_IMAGE=$IMAGE
export GPU_TYPE="gb200-fp8"

# Launch jobs based on ISL/OSL
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
Expand All @@ -36,4 +34,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \
$DECODE_NUM_WORKERS \
$N_ADDITIONAL_FRONTENDS \
$ISL $OSL "${CONC_LIST// /x}" inf \
$SCRIPT_MODE
$GPU_TYPE \
$SCRIPT_MODE
22 changes: 8 additions & 14 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,16 @@ export SLURM_JOB_NAME="benchmark-dynamo.job"

### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
# Set IMAGE based on ISL/OSL
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh"
else
export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh"
fi
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the PR, is it possible to have the IMAGE inherit from the nvidia-master.yaml instead of hard setting in the launcher script?

kinda like what trtllm dynamo already does?
Image

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @functionstackx , thanks for the comment! I have updated the code in InferenceMAX/InferenceMAX@c1024db so that Dynamo+SGLang will also pull the container from nvidia-master.yaml.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# FIXME: Another workaround for all the different branching
# THIS NEEDS TO BE STANDARDIZED ASAP
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
if [[ $PRECISION == "fp4" ]]; then
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for posterity, it would be preferable if this was retrieved from the master config
I.e., in the master config, make the model field /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 and then add a brief comment explaining that on the GB200 cluster we use pre-downloaded models, as opposed to the standard convention (in InferenceMAX) of downloading with HF to the HF cache.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion! I have addressed them through InferenceMAX/InferenceMAX@35d7555 and InferenceMAX/InferenceMAX@45cc883, which apply to the TRTLLM side of the code as well.

else
export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs"
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

fi

export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
else
SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
Expand Down Expand Up @@ -148,4 +142,4 @@ PY
done
fi

echo "All result files processed"
echo "All result files processed"