From 67445264f5f28452dbd692e669e5f07fb12c1f36 Mon Sep 17 00:00:00 2001 From: Nicolas Castet Date: Fri, 23 Jan 2026 15:47:01 -0600 Subject: [PATCH] Fix recipes for sglang v0.5.5 --- recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml | 2 ++ recipies/gb200-fp4/1k1k/max-tpt.yaml | 1 + recipies/gb200-fp4/1k1k/mid-curve.yaml | 3 +- recipies/gb200-fp4/1k8k/max-tpt.yaml | 1 + recipies/gb200-fp4/1k8k/mid-curve.yaml | 1 + recipies/gb200-fp4/8k1k/max-tpt.yaml | 35 ++++++++++++------------ recipies/gb200-fp4/8k1k/mid-curve.yaml | 31 +++++++++++---------- 7 files changed, 41 insertions(+), 33 deletions(-) diff --git a/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml b/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml index 39fccc69..d32da609 100644 --- a/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml +++ b/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml @@ -38,6 +38,7 @@ backend: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1" SGLANG_ENABLE_SPEC_V2: "1" SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" @@ -65,6 +66,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1" SGLANG_ENABLE_SPEC_V2: "1" diff --git a/recipies/gb200-fp4/1k1k/max-tpt.yaml b/recipies/gb200-fp4/1k1k/max-tpt.yaml index 80cb66d9..0cabd2cb 100644 --- a/recipies/gb200-fp4/1k1k/max-tpt.yaml +++ b/recipies/gb200-fp4/1k1k/max-tpt.yaml @@ -56,6 +56,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: diff --git a/recipies/gb200-fp4/1k1k/mid-curve.yaml b/recipies/gb200-fp4/1k1k/mid-curve.yaml index 6405b5e0..36eef7ab 100644 --- a/recipies/gb200-fp4/1k1k/mid-curve.yaml +++ b/recipies/gb200-fp4/1k1k/mid-curve.yaml @@ -1,4 +1,4 @@ -# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher +# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher # per gpu throughput name: "gb200-fp4-max-tpt-2" @@ -57,6 +57,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml index 97f0f3f1..fe716c81 100644 --- a/recipies/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml @@ -55,6 +55,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions sglang_config: prefill: diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml index 1e5771f2..78296911 100644 --- a/recipies/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml @@ -56,6 +56,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions sglang_config: prefill: diff --git a/recipies/gb200-fp4/8k1k/max-tpt.yaml b/recipies/gb200-fp4/8k1k/max-tpt.yaml index d4f0fb00..e9bda3a4 100644 --- a/recipies/gb200-fp4/8k1k/max-tpt.yaml +++ b/recipies/gb200-fp4/8k1k/max-tpt.yaml @@ -7,9 +7,9 @@ model: resources: gpu_type: "gb200" - prefill_nodes: 10 - decode_nodes: 8 - prefill_workers: 10 + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 decode_workers: 1 gpus_per_node: 4 @@ -54,6 +54,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: @@ -79,7 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9600 + context-length: 9600 disable-shared-experts-fusion: true disaggregation-bootstrap-port: 30001 @@ -89,8 +90,8 @@ backend: # Memory and token limits mem-fraction-static: 0.95 max-total-tokens: 131072 - max-prefill-tokens: 524288 - chunked-prefill-size: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 # Request handling max-running-requests: 30000 @@ -98,13 +99,13 @@ backend: # Performance optimizations disable-cuda-graph: true - enable-dp-attention: false + enable-dp-attention: false # Parallelism tp-size: 4 dp-size: 1 ep-size: 1 - + decode: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" @@ -127,7 +128,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9600 + context-length: 9600 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -137,11 +138,11 @@ backend: # Memory and token limits mem-fraction-static: 0.83 - max-total-tokens: 524288 - chunked-prefill-size: 24576 + max-total-tokens: 524288 + chunked-prefill-size: 24576 # Request handling - max-running-requests: 16384 + max-running-requests: 16384 # DeepEP configuration moe-a2a-backend: "deepep" @@ -159,13 +160,13 @@ backend: enable-dp-attention: true # Parallelism - tp-size: 32 - dp-size: 32 - ep-size: 32 + tp-size: 32 + dp-size: 32 + ep-size: 32 benchmark: type: "sa-bench" - isl: 8192 + isl: 8192 osl: 1024 concurrencies: "1024x2048x8192" - req_rate: 700 + req_rate: 700 diff --git a/recipies/gb200-fp4/8k1k/mid-curve.yaml b/recipies/gb200-fp4/8k1k/mid-curve.yaml index 58446851..1b0b3246 100644 --- a/recipies/gb200-fp4/8k1k/mid-curve.yaml +++ b/recipies/gb200-fp4/8k1k/mid-curve.yaml @@ -8,7 +8,7 @@ model: resources: gpu_type: "gb200" prefill_nodes: 6 - decode_nodes: 12 + decode_nodes: 12 prefill_workers: 6 decode_workers: 1 gpus_per_node: 4 @@ -54,6 +54,7 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: @@ -79,7 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9600 + context-length: 9600 disable-shared-experts-fusion: true disaggregation-bootstrap-port: 30001 @@ -89,8 +90,8 @@ backend: # Memory and token limits mem-fraction-static: 0.95 max-total-tokens: 131072 - max-prefill-tokens: 524288 - chunked-prefill-size: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 # Request handling max-running-requests: 30000 @@ -98,13 +99,13 @@ backend: # Performance optimizations disable-cuda-graph: true - enable-dp-attention: false + enable-dp-attention: false # Parallelism tp-size: 4 dp-size: 1 ep-size: 1 - + decode: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" @@ -127,7 +128,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9600 + context-length: 9600 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -137,11 +138,11 @@ backend: # Memory and token limits mem-fraction-static: 0.83 - max-total-tokens: 524288 - chunked-prefill-size: 24576 + max-total-tokens: 524288 + chunked-prefill-size: 24576 # Request handling - max-running-requests: 16384 + max-running-requests: 16384 # DeepEP configuration moe-a2a-backend: "deepep" @@ -159,13 +160,13 @@ backend: enable-dp-attention: true # Parallelism - tp-size: 48 - dp-size: 48 - ep-size: 48 + tp-size: 48 + dp-size: 48 + ep-size: 48 benchmark: type: "sa-bench" - isl: 8192 + isl: 8192 osl: 1024 concurrencies: "512x1024x2048x4096" - req_rate: 700 \ No newline at end of file + req_rate: 700 \ No newline at end of file