Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions recipes/gb200-fp4/1k1k/1p2d-mtp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand Down Expand Up @@ -104,7 +103,6 @@ backend:
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
speculative-algorithm: "EAGLE"
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp4/1k1k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
disaggregation-transfer-backend: nixl
Expand Down Expand Up @@ -104,7 +103,6 @@ backend:
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
disaggregation-transfer-backend: nixl
fp4-gemm-backend: "flashinfer_trtllm"
tensor-parallel-size: 4
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp4/1k8k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ backend:
disable-cuda-graph: true
max-running-requests: 512
scheduler-recv-interval: 10
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand All @@ -101,7 +100,6 @@ backend:
chunked-prefill-size: 8192
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
fp4-gemm-backend: "flashinfer_trtllm"
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp4/8k1k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand Down Expand Up @@ -105,7 +104,6 @@ backend:
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
disaggregation-transfer-backend: nixl
fp4-gemm-backend: "flashinfer_trtllm"
tensor-parallel-size: 4
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/1k1k/low-latency-mtp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: true
enable-symm-mem: true
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand Down Expand Up @@ -111,7 +110,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 8
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/1k1k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ backend:
scheduler-recv-interval: 10
fp8-gemm-backend: "flashinfer_trtllm"
enable-symm-mem: true
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand All @@ -106,7 +105,6 @@ backend:
max-running-requests: 128
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/1k8k/low-latency-mtp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ backend:
load-balance-method: "round_robin"
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
disaggregation-bootstrap-port: 30001
tensor-parallel-size: 8
data-parallel-size: 1
Expand Down Expand Up @@ -109,7 +108,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false
enable-symm-mem: true
moe-dense-tp-size: 1
prefill-round-robin-balance: true
disaggregation-bootstrap-port: 30001
tensor-parallel-size: 8
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/1k8k/low_latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ backend:
enable-flashinfer-allreduce-fusion: false
fp8-gemm-backend: "flashinfer_trtllm"
enable-symm-mem: true
moe-dense-tp-size: 1
disaggregation-bootstrap-port: 30001
tensor-parallel-size: 8
data-parallel-size: 1
Expand All @@ -107,7 +106,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false
enable-symm-mem: false #true
moe-dense-tp-size: 1
disaggregation-bootstrap-port: 30001
prefill-round-robin-balance: true
tensor-parallel-size: 8
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/8k1k/low-latency-mtp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ backend:
load-balance-method: "round_robin"
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand Down Expand Up @@ -107,7 +106,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false
enable-symm-mem: true
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 8
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb200-fp8/8k1k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ backend:
max-running-requests: 512
load-balance-method: "round_robin"
scheduler-recv-interval: 10
moe-dense-tp-size: 1
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 1
Expand All @@ -102,7 +101,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 8
data-parallel-size: 1
Expand Down
4 changes: 1 addition & 3 deletions recipes/gb300-fp4/128k8k/5-low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ backend:
expert-parallel-size: 4
kv-cache-dtype: fp8_e4m3
mem-fraction-static: 0.95
moe-dense-tp-size: 1
moe-runner-backend: flashinfer_trtllm
pipeline-parallel-size: 1
prefill-round-robin-balance: true
Expand All @@ -100,7 +99,6 @@ backend:
load-balance-method: round_robin
max-total-tokens: 544000
mem-fraction-static: 0.95
moe-dense-tp-size: 1
moe-runner-backend: flashinfer_trtllm
pipeline-parallel-size: 4
quantization: modelopt_fp4
Expand All @@ -116,4 +114,4 @@ benchmark:
isl: 128000
osl: 8000
req_rate: "inf"
type: "sa-bench"
type: "sa-bench"
2 changes: 0 additions & 2 deletions recipes/gb300-fp4/1k1k/low_latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand Down Expand Up @@ -104,7 +103,6 @@ backend:
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
fp4-gemm-backend: "flashinfer_trtllm"
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp4/1k8k/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ backend:
disable-cuda-graph: true
max-running-requests: 512
scheduler-recv-interval: 10
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand All @@ -101,7 +100,6 @@ backend:
chunked-prefill-size: 8192
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
fp4-gemm-backend: "flashinfer_trtllm"
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp4/8k1k/low_latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ backend:
max-running-requests: 512
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
load-balance-method: "round_robin"
disaggregation-bootstrap-port: 30001
data-parallel-size: 1
Expand Down Expand Up @@ -106,7 +105,6 @@ backend:
cuda-graph-max-bs: 128
scheduler-recv-interval: 10
enable-symm-mem: true
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
enable-dp-attention: false
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/1k1k/mtp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand Down Expand Up @@ -117,7 +116,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/1k1k/stp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand All @@ -109,7 +108,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/1k8k/mtp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand Down Expand Up @@ -117,7 +116,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/1k8k/stp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand All @@ -109,7 +108,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/8k1k/mtp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand Down Expand Up @@ -117,7 +116,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
2 changes: 0 additions & 2 deletions recipes/gb300-fp8/8k1k/stp/low-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ backend:
scheduler-recv-interval: 10
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
Expand All @@ -109,7 +108,6 @@ backend:
scheduler-recv-interval: 1 # save mem
enable-flashinfer-allreduce-fusion: false # to save mem
enable-symm-mem: false # to save mem
moe-dense-tp-size: 1
prefill-round-robin-balance: true
tensor-parallel-size: 4
data-parallel-size: 1
Expand Down
Loading