SemiAnalysisAI · kedarpotdar-nv · Feb 9, 2026 · Feb 5, 2026 · Feb 5, 2026 · Feb 5, 2026
@@ -2862,9 +2862,9 @@ dsr1-fp8-gb200-dynamo-trt:
 
 
 dsr1-fp8-gb200-dynamo-sglang:
-  image: lmsysorg/sglang:v0.5.5.post2
+  image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
+  model-prefix: dsr1-fp8
   runner: gb200
   precision: fp8
   framework: dynamo-sglang
@@ -2874,114 +2874,216 @@ dsr1-fp8-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32)
-    - spec-decoding: "none"
-      conc-list: [ 4096 ]
+    # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
+    - conc-list: [4, 8]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+
+    # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48)
+    - conc-list: [1024, 2048, 4096]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml"
+      decode:
+        num-worker: 1
+        tp: 48
+        ep: 48
+        dp-attn: true
+
+    # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [1024, 2048, 4096, 6144]
       prefill:
         num-worker: 2
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 32
         ep: 32
         dp-attn: true
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8)
+    - conc-list: [4, 8, 16]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=8"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
 
-    # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4)
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 64, 128 ]
+    # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [512, 1024, 2048, 6144]
       prefill:
+        num-worker: 5
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml"
+      decode:
         num-worker: 1
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
-        ep: 4
+        tp: 32
+        ep: 32
+        dp-attn: true
+
+    # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
+    - conc-list: [2048, 4096, 6144]
+      prefill:
+        num-worker: 6
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-low-latency"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml
+        - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml"
       decode:
-        num-worker: 4
-        tp: 1
-        ep: 4
+        num-worker: 1
+        tp: 24
+        ep: 24
         dp-attn: true
+
+dsr1-fp8-gb300-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.8-cu130
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1-fp8
+  runner: gb300
+  precision: fp8
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4)
+    - conc-list: [4, 8, 16, 32]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=4"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 1
+        dp-attn: false
 
-    # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
+    # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [1024, 2048, 4096, 6144]
       prefill:
-        num-worker: 3
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
-        tp: 1
+        num-worker: 2
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=6"
-        - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 48
+        tp: 32
+        ep: 32
+        dp-attn: true
+
+    # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8)
+    - conc-list: [4096, 7168, 7680]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=12"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
 
   - isl: 8192
     osl: 1024
     search-space:
-    # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
-    - spec-decoding: "none"
-      conc-list: [ 4, 8, 16, 32 ]
+    # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4)
+    - conc-list: [4, 8]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 4
-        dp-attn: true
+        tp: 4
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=1"
-        - "N_ADDITIONAL_FRONTENDS=8"
-        - "SCRIPT_MODE=8k1k-low-latency"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
+        tp: 4
+        ep: 1
+        dp-attn: false
 
-    # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048, 6144 ]
+    # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32)
+    - conc-list: [128, 256, 512, 1024]
       prefill:
         num-worker: 5
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=10"
-        - "N_ADDITIONAL_FRONTENDS=8"
-        - "SCRIPT_MODE=8k1k-max-tpt"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 32
         ep: 32
         dp-attn: true
+
+    # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24)
+    - conc-list: [2048, 4096]
+      prefill:
+        num-worker: 6
+        tp: 8
+        ep: 8
+        dp-attn: true
         additional-settings:
-        - "DECODE_NODES=8"
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml
+        - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml"
+      decode:
+        num-worker: 1
+        tp: 24
+        ep: 24
+        dp-attn: true
 
 dsr1-fp4-gb200-dynamo-sglang:
   image: "lmsysorg/sglang:v0.5.5.post2"

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -361,3 +361,13 @@
     - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads"
     - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/617
+
+- config-keys:
+    - dsr1-fp8-gb200-dynamo-sglang
+    - dsr1-fp8-gb300-dynamo-sglang
+  description:
+    - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode"
+    - "Image: lmsysorg/sglang:v0.5.8-cu130"
+    - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635
+
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
@@ -11,6 +11,8 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
     export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
     if [[ $MODEL_PREFIX == "dsr1" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
+    elif [[ $MODEL_PREFIX == "dsr1-fp8" ]]; then
+        export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/"
     else
         export MODEL_PATH=$MODEL
     fi
@@ -48,50 +50,6 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX
 export ISL="$ISL"
 export OSL="$OSL"
 
-if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
-    export IMAGE=$SQUASH_FILE
-    export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
-    bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}.sh"
-    # Wait for all jobs to complete
-    echo "Waiting for all jobs to complete..."
-    while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do
-        echo "Jobs still running..."
-        squeue --steps -u $USER
-        sleep 30
-    done
-
-        # Find the latest log directory that contains the data
-    cat > collect_latest_results.py <<'PY'
-import os, sys
-sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
-for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
-    print(path)
-PY
-
-    LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL 1)
-    if [ -z "$LOGS_DIR" ]; then
-        echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
-        exit 1
-    fi
-
-    echo "Found logs directory: $LOGS_DIR"
-    ls -la $LOGS_DIR
-
-    # Result JSON are contained within the result directory
-    for result_file in $(find $LOGS_DIR -type f); do
-        # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json
-        file_name=$(basename $result_file)
-        if [ -f $result_file ]; then
-            # Copy the result file to workspace with a unique name
-            WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
-            echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}"
-            cp $result_file $WORKSPACE_RESULT_FILE
-        fi
-    done
-
-    exit 0
-fi
-
 echo "Cloning srt-slurm repository..."
 SRT_REPO_DIR="srt-slurm"
 if [ -d "$SRT_REPO_DIR" ]; then