Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
name: "b200-fp4-low-latency-dep4-1p-tep8-5d"

model:
path: "dsfp4"
container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp4"

resources:
gpu_type: "b200"
prefill_nodes: 1
prefill_workers: 1
gpus_per_prefill: 4
decode_nodes: 5
decode_workers: 5
gpus_per_node: 8

backend:
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
disable-cuda-graph: true

# Parallelism
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4
enable-dp-attention: true
enable-dp-lm-head: true

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
cuda-graph-max-bs: 512

# Parallelism
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 8

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
# moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "16x128x512"
req_rate: "inf"
139 changes: 139 additions & 0 deletions recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
name: "b200-fp4-low-latency-dep4-1p-tep8-6d"

model:
path: "dsfp4"
container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp4"

resources:
gpu_type: "b200"
prefill_nodes: 1
prefill_workers: 1
gpus_per_prefill: 4
decode_nodes: 6
decode_workers: 6
gpus_per_node: 8

backend:
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
DYN_REQUEST_PLANE: "nats"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
disable-cuda-graph: true

# Parallelism
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4
enable-dp-attention: true
enable-dp-lm-head: true

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
quantization: "modelopt_fp4"

# Disaggregation mode
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.85
max-prefill-tokens: 32768
chunked-prefill-size: 32768
context-length: 2200
max-running-requests: 512
cuda-graph-max-bs: 512

# Parallelism
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 8

# Attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# MoE
moe-runner-backend: "flashinfer_trtllm"
# moe-dense-tp-size: 1

# Other flags
stream-interval: 30
watchdog-timeout: 1000000
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "32x64x256x512"
req_rate: "inf"
Loading
Loading