-
Notifications
You must be signed in to change notification settings - Fork 37
Add GB200 DSR1-FP4 1k/8k recipes #85
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| name: "gb200-fp4-1p2d" | ||
|
|
||
| model: | ||
| path: "dsfp4" | ||
| container: "lmsysorg/sglang:dev-cu13" | ||
| precision: "fp4" | ||
|
|
||
| resources: | ||
| gpu_type: "gb200" | ||
| prefill_nodes: 1 | ||
| decode_nodes: 2 | ||
| prefill_workers: 1 | ||
| decode_workers: 2 | ||
| gpus_per_node: 4 | ||
|
|
||
| backend: | ||
|
|
||
| prefill_environment: | ||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||
| PYTHONUNBUFFERED: "1" | ||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||
| SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" | ||
| #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" | ||
| #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" | ||
| MC_FORCE_MNNVL: "1" | ||
| NCCL_MNNVL_ENABLE: "1" | ||
| NCCL_CUMEM_ENABLE: "1" | ||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||
| SGLANG_ENABLE_JIT_DEEPGEMM: "false" | ||
| SGLANG_ENABLE_FLASHINFER_GEMM: "true" | ||
|
|
||
| decode_environment: | ||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||
| PYTHONUNBUFFERED: "1" | ||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||
| SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" | ||
| # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" | ||
| # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" | ||
| MC_FORCE_MNNVL: "1" | ||
| NCCL_MNNVL_ENABLE: "1" | ||
| NCCL_CUMEM_ENABLE: "1" | ||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||
| SGLANG_ENABLE_JIT_DEEPGEMM: "false" | ||
| SGLANG_ENABLE_FLASHINFER_GEMM: "true" | ||
|
|
||
| sglang_config: | ||
| prefill: | ||
| disaggregation-mode: "prefill" | ||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||
| model-path: "/model/" | ||
| trust-remote-code: true | ||
| disable-radix-cache: true | ||
| kv-cache-dtype: "fp8_e4m3" | ||
| attention-backend: "trtllm_mla" | ||
| quantization: "modelopt_fp4" | ||
| moe-runner-backend: "flashinfer_trtllm" | ||
| stream-interval: 10 | ||
| watchdog-timeout: 1000000 | ||
| context-length: 9200 | ||
| mem-fraction-static: 0.95 | ||
| max-total-tokens: 8192 | ||
| chunked-prefill-size: 8192 | ||
| disable-cuda-graph: true | ||
| max-running-requests: 512 | ||
| scheduler-recv-interval: 10 | ||
| moe-dense-tp-size: 1 | ||
| load-balance-method: "round_robin" | ||
| disaggregation-bootstrap-port: 30001 | ||
| data-parallel-size: 1 | ||
| tensor-parallel-size: 4 | ||
| expert-parallel-size: 1 | ||
|
ishandhanani marked this conversation as resolved.
|
||
|
|
||
| decode: | ||
| disaggregation-mode: "decode" | ||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||
| model-path: "/model/" | ||
| prefill-round-robin-balance: true | ||
| trust-remote-code: true | ||
| disable-radix-cache: true | ||
| kv-cache-dtype: "fp8_e4m3" | ||
| attention-backend: "trtllm_mla" | ||
| quantization: "modelopt_fp4" | ||
| moe-runner-backend: "flashinfer_trtllm" | ||
| disaggregation-bootstrap-port: 30001 | ||
| stream-interval: 10 | ||
| watchdog-timeout: 1000000 | ||
| context-length: 9200 | ||
| mem-fraction-static: 0.95 | ||
| chunked-prefill-size: 8192 | ||
| cuda-graph-max-bs: 256 | ||
| scheduler-recv-interval: 10 | ||
| moe-dense-tp-size: 1 | ||
| tensor-parallel-size: 4 | ||
| expert-parallel-size: 1 | ||
|
|
||
|
Comment on lines
+83
to
+105
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed: rg -n "dp-size|data-parallel-size" --type yaml -A 2 -B 2
Repository: ishandhanani/srt-slurm. Length of output: 13848
🏁 Script executed: # Also check the full context of the file in question
cat -n recipies/gb200-fp4/1k8k/low-latency.yaml | head -120
Repository: ishandhanani/srt-slurm. Length of output: 4229
🏁 Script executed: # Search for other decode sections to see patterns
rg -n "^\s+decode:" --type yaml -A 20
Repository: ishandhanani/srt-slurm. Length of output: 45703
Add `data-parallel-size` to the decode section. The decode section omits `data-parallel-size`, which the prefill section sets explicitly.
🤖 Prompt for AI Agents |
||
| benchmark: | ||
| type: "sa-bench" | ||
| isl: 1024 | ||
| osl: 8192 | ||
| concurrencies: "4x8x32x64x112" | ||
| req_rate: "inf" | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,177 @@ | ||||||
| # 4P1D, with 12 Decode Nodes. Uses single batch overlap | ||||||
|
|
||||||
| name: "gb200-fp4-max-tpt" | ||||||
|
|
||||||
| model: | ||||||
| path: "dsfp4" | ||||||
| container: "lmsysorg/sglang:dev-cu13" | ||||||
| precision: "fp4" | ||||||
|
|
||||||
| resources: | ||||||
| gpu_type: "gb200" | ||||||
| prefill_nodes: 4 | ||||||
| decode_nodes: 12 | ||||||
| prefill_workers: 4 | ||||||
| decode_workers: 1 | ||||||
| gpus_per_node: 4 | ||||||
|
|
||||||
| backend: | ||||||
|
|
||||||
| # Prefill-specific environment variables | ||||||
| prefill_environment: | ||||||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||||||
| PYTHONUNBUFFERED: "1" | ||||||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||||||
| SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" | ||||||
| SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" | ||||||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||||||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||||||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||||||
| SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" | ||||||
| MC_TE_METRIC: "true" | ||||||
| MC_FORCE_MNNVL: "1" | ||||||
| NCCL_MNNVL_ENABLE: "1" | ||||||
| NCCL_CUMEM_ENABLE: "1" | ||||||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||||||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||||||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||||||
|
|
||||||
| # Decode-specific environment variables | ||||||
| decode_environment: | ||||||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||||||
| PYTHONUNBUFFERED: "1" | ||||||
| DYN_SKIP_SGLANG_LOG_FORMATTING: "1" | ||||||
| SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" | ||||||
| SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" | ||||||
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||||||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||||||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||||||
| SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" | ||||||
| MC_TE_METRIC: "true" | ||||||
| MC_FORCE_MNNVL: "1" | ||||||
| NCCL_MNNVL_ENABLE: "1" | ||||||
| NCCL_CUMEM_ENABLE: "1" | ||||||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||||||
| SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" | ||||||
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||||||
| SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" | ||||||
| SGLANG_MOE_NVFP4_DISPATCH: "1" | ||||||
| SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" | ||||||
|
|
||||||
| sglang_config: | ||||||
| prefill: | ||||||
| # Model configuration | ||||||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||||||
| model-path: "/model/" | ||||||
| trust-remote-code: true | ||||||
|
|
||||||
| # KV cache and attention | ||||||
| kv-cache-dtype: "fp8_e4m3" | ||||||
| attention-backend: "trtllm_mla" | ||||||
|
|
||||||
| # Quantization | ||||||
| quantization: "modelopt_fp4" | ||||||
| moe-runner-backend: "flashinfer_cutlass" | ||||||
|
|
||||||
| # Radix cache disabled | ||||||
| disable-radix-cache: true | ||||||
| disable-chunked-prefix-cache: true | ||||||
|
|
||||||
| # Other flags | ||||||
| stream-interval: 50 | ||||||
| decode-log-interval: 1000 | ||||||
| watchdog-timeout: 1000000 | ||||||
| context-length: 9200 | ||||||
| disable-shared-experts-fusion: true | ||||||
| eplb-algorithm: "deepseek" | ||||||
| disaggregation-bootstrap-port: 30001 | ||||||
|
|
||||||
| # Prefill-specific mode | ||||||
| disaggregation-mode: "prefill" | ||||||
|
|
||||||
| # Memory and token limits | ||||||
| mem-fraction-static: 0.84 | ||||||
| max-total-tokens: 131072 | ||||||
| max-prefill-tokens: 32768 | ||||||
| chunked-prefill-size: 65536 | ||||||
| enable-single-batch-overlap: true | ||||||
|
|
||||||
| # Request handling | ||||||
| max-running-requests: 30000 | ||||||
| load-balance-method: "round_robin" | ||||||
|
|
||||||
| # Performance optimizations | ||||||
| disable-cuda-graph: true | ||||||
| enable-dp-attention: true | ||||||
|
|
||||||
| # Parallelism | ||||||
| tp-size: 4 | ||||||
| dp-size: 4 | ||||||
| ep-size: 4 | ||||||
|
|
||||||
| decode: | ||||||
| # Model configuration | ||||||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||||||
| model-path: "/model/" | ||||||
| trust-remote-code: true | ||||||
|
|
||||||
| # KV cache and attention | ||||||
| kv-cache-dtype: "fp8_e4m3" | ||||||
| attention-backend: "trtllm_mla" | ||||||
|
|
||||||
| # Quantization | ||||||
| quantization: "modelopt_fp4" | ||||||
| moe-runner-backend: "flashinfer_cutedsl" | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Possible typo: `flashinfer_cutedsl`. The prefill section (line 74) uses `flashinfer_cutlass`.
Proposed fix:
- moe-runner-backend: "flashinfer_cutedsl"
+ moe-runner-backend: "flashinfer_cutlass"
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
|
|
||||||
| # Radix cache disabled | ||||||
| disable-radix-cache: true | ||||||
| disable-chunked-prefix-cache: true | ||||||
|
|
||||||
| # Other flags | ||||||
| stream-interval: 50 | ||||||
| decode-log-interval: 1000 | ||||||
| watchdog-timeout: 1000000 | ||||||
| context-length: 9200 | ||||||
| disable-shared-experts-fusion: true | ||||||
| eplb-algorithm: "deepseek" | ||||||
| disaggregation-bootstrap-port: 30001 | ||||||
|
|
||||||
| # Decode-specific mode | ||||||
| disaggregation-mode: "decode" | ||||||
|
|
||||||
| # Memory and token limits | ||||||
| mem-fraction-static: 0.83 | ||||||
| max-total-tokens: 3122380 | ||||||
| chunked-prefill-size: 786432 | ||||||
|
|
||||||
| # Request handling | ||||||
| max-running-requests: 67584 | ||||||
| enable-single-batch-overlap: true | ||||||
|
|
||||||
| # DeepEP configuration | ||||||
| moe-a2a-backend: "deepep" | ||||||
| deepep-mode: "low_latency" | ||||||
| ep-dispatch-algorithm: "static" | ||||||
| ep-num-redundant-experts: 32 | ||||||
|
|
||||||
| # CUDA graphs (extensive batch size list) | ||||||
| cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] | ||||||
| num-reserved-decode-tokens: 112 | ||||||
|
|
||||||
| # Additional decode optimizations | ||||||
| moe-dense-tp-size: 1 | ||||||
| enable-dp-lm-head: true | ||||||
| prefill-round-robin-balance: true | ||||||
| enable-dp-attention: true | ||||||
|
|
||||||
| # Parallelism | ||||||
| tp-size: 48 | ||||||
| dp-size: 48 | ||||||
| ep-size: 48 | ||||||
|
|
||||||
| benchmark: | ||||||
| type: "sa-bench" | ||||||
| isl: 1024 | ||||||
| osl: 8192 | ||||||
| concurrencies: "1x128x512x2048x4096x8192" | ||||||
| req_rate: "inf" | ||||||
Uh oh!
There was an error while loading. Please reload this page.