Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
109 changes: 109 additions & 0 deletions recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 1 decode worker (2 nodes),
# max data/expert parallelism on decode, with MTP (EAGLE speculative decoding).
# Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p1d-max-dep-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 2
  decode_workers: 1
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  # NOTE(review): the 1p2d MTP recipe also sets SGLANG_ENABLE_SPEC_V2: "1" on
  # both roles — confirm whether its absence here is intentional.
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Prefill capacity
      max-running-requests: 4

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

      # MTP (Multi-Token Prediction)
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: max DP/EP (one rank per GPU) with DP attention
      tp-size: 16
      dp-size: 16
      ep-size: 16
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      # NOTE(review): 0.85 here vs 0.9 in the non-MTP 1p1d recipe — presumably
      # headroom for the EAGLE draft model; confirm this is intended.
      mem-fraction-static: 0.85
      max-running-requests: 64
      cuda-graph-max-bs: 64

      # MTP
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64"
  req_rate: "inf"
111 changes: 111 additions & 0 deletions recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 2 decode workers (4 nodes),
# max tensor parallelism (no DP attention), with MTP (EAGLE speculative
# decoding, spec v2 enabled). Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p2d-max-tp-mtp"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 4
  decode_workers: 2
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_SPEC_V2: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_ENABLE_SPEC_V2: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      max-running-requests: 2

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

      # MTP (Multi-Token Prediction)
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP per decode worker (16 GPUs each)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.9
      max-running-requests: 128
      cuda-graph-max-bs: 128

      # MTP
      speculative-algorithm: "EAGLE"
      speculative-num-steps: 2
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64x128"
  req_rate: "inf"
97 changes: 97 additions & 0 deletions recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
---
# Recipe: H100 FP8, 1 prefill worker (2 nodes) + 1 decode worker (2 nodes),
# max data/expert parallelism on decode, standard token prediction (no MTP).
# Benchmarked at ISL/OSL 1024/1024.
#
# NOTE(review): indentation reconstructed from a flattened paste; the nesting of
# sglang_config under backend mirrors the other recipes in this change —
# confirm against the repo's recipe schema.
name: "h100-fp8-1p1d-max-dep"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130"
  precision: "fp8"

resources:
  gpu_type: "h100"
  prefill_nodes: 2
  prefill_workers: 1
  decode_nodes: 2
  decode_workers: 1
  gpus_per_node: 8

backend:

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: pure TP across both prefill nodes (16 GPUs)
      tp-size: 16
      dp-size: 1
      ep-size: 1
      enable-dp-attention: false

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Prefill capacity
      max-running-requests: 4

      # Prefill-specific disaggregation mode
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.6
      max-prefill-tokens: 2048
      chunked-prefill-size: 2048

      # Request handling
      load-balance-method: "round_robin"

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism: max DP/EP (one rank per GPU) with DP attention
      tp-size: 16
      dp-size: 16
      ep-size: 16
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 1

      # Disaggregation (decode side)
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.9
      max-running-requests: 64
      cuda-graph-max-bs: 64

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x2x4x8x16x32x64"
  req_rate: "inf"
Loading