Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions recipes/h200/1k1k/bs128-agg-tp-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Aggregated (single-pool) recipe: 1 node x 8 H200 GPUs, 1 worker, tp-size 8,
# FP8 precision, with MTP speculative decoding. Benchmarked with sa-bench at
# isl = osl = 1024.
name: "agg-tp-h200-fp8-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
  precision: "fp8"

resources:
  gpu_type: "h200"
  agg_nodes: 1
  agg_workers: 1
  gpus_per_node: 8

backend:

  # Aggregated environment variables
  # NOTE(review): the SGLANG_DISAGGREGATION_* values mirror the disaggregated
  # recipes; presumably unused in aggregated mode - confirm before removing.
  aggregated_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

  sglang_config:
    aggregated:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true
      watchdog-timeout: 1000000

      # Parallelism
      tp-size: 8
      dp-size: 1

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      stream-interval: 10
      max-running-requests: 128  # sum of all dp

      # Memory and token limits
      mem-fraction-static: 0.75
      max-prefill-tokens: 32768
      chunked-prefill-size: 32768

      # CUDA graphs (kept equal to max-running-requests above)
      cuda-graph-max-bs: 128

      # MTP settings
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # Presumably an "x"-separated sweep of client concurrency levels; the top
  # points (256, 512) exceed max-running-requests (128) - TODO confirm this
  # oversubscription is intended.
  concurrencies: "1x4x16x32x64x128x256x512"
  req_rate: "inf"
115 changes: 115 additions & 0 deletions recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Disaggregated recipe: 1 prefill worker on 1 node and 6 decode workers on
# 6 nodes, 8 H200 GPUs per node, FP8 precision. Both roles run with
# tp-size = dp-size = ep-size = 8 and DP attention enabled; the decode role
# additionally enables MTP speculative decoding. Benchmarked with sa-bench at
# isl = osl = 1024.
#
# The name carries the parallelism mode ("dep"), mirroring the recipe
# filename, so runs are distinguishable from other 1p6d recipes.
name: "bs256-1p6d-dep-h200-fp8-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
  precision: "fp8"

resources:
  gpu_type: "h200"
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 6
  decode_workers: 6
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

  # Decode-specific environment variables (same values as prefill)
  decode_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true
      watchdog-timeout: 1000000

      # Parallelism
      tp-size: 8
      dp-size: 8
      ep-size: 8
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      # stream-interval: 50
      max-running-requests: 512

      # Prefill-specific mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      # NOTE(review): chunked-prefill-size (262144) is 4x max-prefill-tokens
      # (65536); presumably it is divided across the 8 DP ranks
      # (262144 / 8 = 32768) - confirm against the SGLang version in use.
      mem-fraction-static: 0.75
      max-prefill-tokens: 65536
      chunked-prefill-size: 262144

      # Request handling
      load-balance-method: "round_robin"

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true
      watchdog-timeout: 1000000

      # Parallelism
      tp-size: 8
      dp-size: 8
      ep-size: 8
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 10

      # Disagg (bootstrap port matches the prefill section)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.75
      max-running-requests: 128
      cuda-graph-max-bs: 128

      # MTP settings
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # Presumably an "x"-separated sweep of client concurrency levels.
  concurrencies: "128x256x512x1024x2048"
  req_rate: "inf"
115 changes: 115 additions & 0 deletions recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Disaggregated recipe: 1 prefill worker on 1 node and 6 decode workers on
# 6 nodes, 8 H200 GPUs per node, FP8 precision. Both roles run pure tensor
# parallelism (tp-size 8, dp-size 1, ep-size 1); the decode role additionally
# enables MTP speculative decoding. Benchmarked with sa-bench at
# isl = osl = 1024.
#
# The name carries the parallelism mode ("tp"), mirroring the recipe
# filename, so runs are distinguishable from other 1p6d recipes.
name: "bs256-1p6d-tp-h200-fp8-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
  precision: "fp8"

resources:
  gpu_type: "h200"
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 6
  decode_workers: 6
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

  # Decode-specific environment variables (same values as prefill)
  decode_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true
      watchdog-timeout: 1000000

      # Parallelism
      tp-size: 8
      dp-size: 1
      ep-size: 1

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      # stream-interval: 50
      max-running-requests: 512

      # Prefill-specific mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.7
      max-prefill-tokens: 163840
      chunked-prefill-size: 163840

      # Request handling
      load-balance-method: "round_robin"

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true
      watchdog-timeout: 1000000

      # Parallelism
      tp-size: 8
      dp-size: 1
      ep-size: 1

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 10

      # Disagg (bootstrap port matches the prefill section)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.75
      max-running-requests: 128
      cuda-graph-max-bs: 128

      # MTP settings
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # NOTE(review): the upper sweep points exceed the decode caps visible in
  # this file. If max-running-requests (128) applies per decode worker, the
  # 6 workers admit 6 x 128 = 768 requests, so 1024 and 2048 would queue.
  # Presumably intentional to measure saturation - confirm, or raise
  # max-running-requests / cuda-graph-max-bs in the decode section.
  # concurrencies: "128x256x512"
  concurrencies: "512x1024x2048"
  req_rate: "inf"
Comment on lines +109 to +115
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

cd /tmp && find . -name "bs256-1p6d-tp-mtp.yaml" -o -name "*.yaml" | head -20

Repository: ishandhanani/srt-slurm

Length of output: 48


🏁 Script executed:

# First, let me search for the file in the repository
find . -name "bs256-1p6d-tp-mtp.yaml" 2>/dev/null

Repository: ishandhanani/srt-slurm

Length of output: 108


🏁 Script executed:

# Let's explore the directory structure to find the YAML file
fd "bs256-1p6d-tp-mtp.yaml"

Repository: ishandhanani/srt-slurm

Length of output: 106


🏁 Script executed:

# Search for the file pattern
fd "\.yaml$" | grep -E "(h200|recipes)"

Repository: ishandhanani/srt-slurm

Length of output: 24678


🏁 Script executed:

cat -n recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml

Repository: ishandhanani/srt-slurm

Length of output: 3184


Benchmark concurrencies exceed max-running-requests (128).
Line 114 targets 512/1024/2048 while the decode configuration caps at 128 (lines 100–101), causing the server to queue requests and preventing the benchmark from fully testing the intended concurrency levels. Raise max-running-requests/cuda-graph-max-bs to support higher concurrencies, or revert to the 128/256/512 set to match current limits.

🔧 Option: align concurrencies with current runtime caps
-  # concurrencies: "128x256x512"
-  concurrencies: "512x1024x2048"
+  concurrencies: "128x256x512"
🤖 Prompt for AI Agents
In `@recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml` around lines 109 - 115, The
benchmark concurrencies ("concurrencies" field set to "512x1024x2048") exceed
the runtime decode caps; either increase the decode limits by raising
max-running-requests and cuda-graph-max-bs to at least 2048 (so they match the
highest concurrency) or revert the "concurrencies" value back to a supported set
like "128x256x512"; update the fields named max-running-requests and
cuda-graph-max-bs (or the decode configuration block) to the new numeric limits
if you choose to raise limits, or change the benchmark.concurrencies string to
the lower values if you choose to align with current caps.

Loading