diff --git a/recipes/gb200-fp4/1k1k/low-latency.yaml b/recipes/gb200-fp4/1k1k/low-latency.yaml index b27f67ec..c953d991 100644 --- a/recipes/gb200-fp4/1k1k/low-latency.yaml +++ b/recipes/gb200-fp4/1k1k/low-latency.yaml @@ -1,8 +1,17 @@ -name: "gb200-fp4-1p2d" +name: "gb200-fp4-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,14 +34,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -44,14 +50,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: @@ -76,6 +79,8 @@ backend: moe-dense-tp-size: 1 load-balance-method: "round_robin" disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 @@ -100,6 +105,8 @@ backend: scheduler-recv-interval: 10 enable-symm-mem: true moe-dense-tp-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" 
tensor-parallel-size: 4 expert-parallel-size: 1 @@ -107,5 +114,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x32x64x112x128x256" - req_rate: "inf" \ No newline at end of file + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/max-tpt.yaml b/recipes/gb200-fp4/1k1k/max-tpt.yaml index ba31ccfc..4bfd6ccf 100644 --- a/recipes/gb200-fp4/1k1k/max-tpt.yaml +++ b/recipes/gb200-fp4/1k1k/max-tpt.yaml @@ -1,10 +1,17 @@ -# 4P1D, with 12 Decode Nodes. Uses single batch overlap +name: "gb200-fp4-1k1k-max-tpt" -name: "gb200-fp4-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -27,7 +34,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -46,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -56,8 +61,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -103,6 +106,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -162,6 +167,8 @@ backend: enable-dp-lm-head: true 
prefill-round-robin-balance: true enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 @@ -172,5 +179,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x128x512x2048x4096x8192x12000x15000" - req_rate: "inf" \ No newline at end of file + concurrencies: "512x2048x4096" + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/mid-curve.yaml b/recipes/gb200-fp4/1k1k/mid-curve.yaml index 2365f2c0..d87d9a5a 100644 --- a/recipes/gb200-fp4/1k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k1k/mid-curve.yaml @@ -1,11 +1,17 @@ -# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher -# per gpu throughput +name: "gb200-fp4-1k1k-mid-curve" -name: "gb200-fp4-max-tpt-2" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -28,7 +34,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -47,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -57,8 +61,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -104,6 +106,8 @@ backend: # Performance optimizations 
disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -162,6 +166,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32 @@ -172,5 +178,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x128x512x2048x4096x8192x12000x15000" - req_rate: "inf" \ No newline at end of file + concurrencies: "512x2048x4096x8192x15000" + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k8k/low-latency.yaml b/recipes/gb200-fp4/1k8k/low-latency.yaml index 6c2a9536..1153983f 100644 --- a/recipes/gb200-fp4/1k8k/low-latency.yaml +++ b/recipes/gb200-fp4/1k8k/low-latency.yaml @@ -1,16 +1,17 @@ -name: "gb200-fp4-1p2d" +name: "gb200-fp4-1k8k-low-latency" dynamo: - version: 0.7.0 + version: 0.8.1 -frontend: - type: dynamo +frontend: + type: dynamo enable_multiple_frontends: true - num_additional_frontends: 4 + num_additional_frontends: 3 + nginx_container: nginx model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -37,7 +38,6 @@ backend: NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -54,12 +54,11 @@ backend: NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: - disaggregation-mode: "prefill" served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "prefill" trust-remote-code: true disable-radix-cache: true kv-cache-dtype: "fp8_e4m3" @@ -81,10 +80,12 @@ backend: data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 + 
fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl decode: - disaggregation-mode: "decode" served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "decode" prefill-round-robin-balance: true trust-remote-code: true disable-radix-cache: true @@ -103,6 +104,8 @@ backend: moe-dense-tp-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" diff --git a/recipes/gb200-fp4/1k8k/max-tpt.yaml b/recipes/gb200-fp4/1k8k/max-tpt.yaml index d2c46140..8d75c7be 100644 --- a/recipes/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipes/gb200-fp4/1k8k/max-tpt.yaml @@ -1,16 +1,17 @@ -name: "gb200-fp4-max-tpt" +name: "gb200-fp4-1k8k-max-tpt" dynamo: - version: 0.7.0 + version: 0.8.1 frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -32,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -51,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,7 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: @@ -69,6 +67,7 @@ backend: # Model configuration served-model-name: 
"deepseek-ai/DeepSeek-R1" trust-remote-code: true + disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -108,6 +107,7 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 4 @@ -118,6 +118,7 @@ backend: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true + disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -233,6 +234,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 @@ -243,5 +245,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 8192 - concurrencies: "256x512x1024x2048x8192" + concurrencies: "256x512x1024x2048" req_rate: "inf" diff --git a/recipes/gb200-fp4/1k8k/mid-curve.yaml b/recipes/gb200-fp4/1k8k/mid-curve.yaml index bf455b72..01141454 100644 --- a/recipes/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k8k/mid-curve.yaml @@ -1,16 +1,17 @@ -name: "gb200-fp4-mid-curve" +name: "gb200-fp4-1k8k-mid-curve" dynamo: - version: 0.7.0 + version: 0.8.1 -frontend: - type: dynamo +frontend: + type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -32,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -51,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: 
"1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,8 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -73,6 +70,7 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -108,6 +106,7 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 4 @@ -122,6 +121,7 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -232,6 +232,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32 diff --git a/recipes/gb200-fp4/8k1k/low-latency.yaml b/recipes/gb200-fp4/8k1k/low-latency.yaml index 73a88588..f274b863 100644 --- a/recipes/gb200-fp4/8k1k/low-latency.yaml +++ b/recipes/gb200-fp4/8k1k/low-latency.yaml @@ -1,8 +1,17 @@ -name: "gb200-8k1k-fp4-low-latency-1p_tp4/4d_tp4" +name: "gb200-fp4-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,14 +34,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" 
NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -44,14 +50,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: @@ -77,6 +80,8 @@ backend: load-balance-method: "round_robin" disaggregation-bootstrap-port: 30001 data-parallel-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" tensor-parallel-size: 4 expert-parallel-size: 1 enable-dp-attention: false @@ -101,12 +106,15 @@ backend: scheduler-recv-interval: 10 enable-symm-mem: true moe-dense-tp-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" tensor-parallel-size: 4 expert-parallel-size: 1 enable-dp-attention: false + benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x32x64" + concurrencies: "4x8" req_rate: 300 diff --git a/recipes/gb200-fp4/8k1k/max-tpt.yaml b/recipes/gb200-fp4/8k1k/max-tpt.yaml index 26b4629d..2164891a 100644 --- a/recipes/gb200-fp4/8k1k/max-tpt.yaml +++ b/recipes/gb200-fp4/8k1k/max-tpt.yaml @@ -1,8 +1,17 @@ -name: "gb200-8k1k-fp4-max-10p_tp4/1d_dep32" +name: "gb200-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,7 +34,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -44,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -54,8 +61,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -99,6 +104,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -156,6 +163,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 32 @@ -166,5 +175,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1024x2048x8192" + concurrencies: "2048" req_rate: 700 diff --git a/recipes/gb200-fp4/8k1k/mid-curve.yaml b/recipes/gb200-fp4/8k1k/mid-curve.yaml index f1e9bb41..43ed73fd 100644 --- a/recipes/gb200-fp4/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/8k1k/mid-curve.yaml @@ -1,8 +1,17 @@ -name: "gb200-8k1k-fp4-mid-6p_tp4/1d_dep48" +name: "gb200-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,7 +34,6 @@ backend: 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -44,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -54,8 +61,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -99,6 +104,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -156,6 +163,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 48 @@ -166,5 +175,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024x2048x4096" - req_rate: 700 \ No newline at end of file + concurrencies: "512x2048x4096" + req_rate: 700 diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml index 7ce9daf3..4349af7d 100644 --- a/recipes/gb200-fp8/1k1k/low-latency.yaml +++ b/recipes/gb200-fp8/1k1k/low-latency.yaml @@ -1,16 +1,25 @@ -name: "gb200-fp8-1p-4d-low-latency" +name: "gb200-fp8-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + 
container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: gpu_type: "gb200" prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 1 prefill_workers: 1 - decode_workers: 4 + decode_workers: 1 gpus_per_node: 4 backend: @@ -18,9 +27,8 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" @@ -36,14 +44,13 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" @@ -72,12 +79,14 @@ backend: max-running-requests: 512 load-balance-method: "round_robin" scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true + fp8-gemm-backend: "flashinfer_trtllm" enable-symm-mem: true moe-dense-tp-size: 1 tensor-parallel-size: 4 data-parallel-size: 1 expert-parallel-size: 1 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl decode: served-model-name: "deepseek-ai/DeepSeek-R1" @@ -94,19 +103,21 @@ backend: mem-fraction-static: 0.95 chunked-prefill-size: 8192 cuda-graph-max-bs: 128 - max-running-requests: 512 + max-running-requests: 128 scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true 
enable-symm-mem: true moe-dense-tp-size: 1 prefill-round-robin-balance: true tensor-parallel-size: 4 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x32x64x80x96x112x128" - req_rate: "inf" \ No newline at end of file + concurrencies: "4x8" + req_rate: "inf" diff --git a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml similarity index 90% rename from recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml rename to recipes/gb200-fp8/1k1k/max-tpt.yaml index e1859cec..2e6dfcbe 100644 --- a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml +++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -1,10 +1,17 @@ -# GB200 FP8 Max Throughput Configuration +name: "gb200-fp8-1k1k-max-tpt" -name: "gb200-fp8-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -20,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" @@ -45,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - 
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -106,6 +112,8 @@ backend: ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + decode: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" @@ -156,10 +164,12 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1024x2048x4096" + concurrencies: "1024x2048x4096x6144" req_rate: "inf" diff --git a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml similarity index 90% rename from recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml rename to recipes/gb200-fp8/1k1k/mid-curve.yaml index 36bbfb7e..7b59b995 100644 --- a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml +++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -1,10 +1,17 @@ -# GB200 FP8 Max Throughput Configuration +name: "gb200-fp8-1k1k-mid-curve" -name: "gb200-fp8-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -20,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +44,7 @@ backend: # 
Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" @@ -45,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -105,6 +111,7 @@ backend: enable-dp-lm-head: true ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl decode: # Model configuration @@ -113,9 +120,9 @@ backend: trust-remote-code: true # Parallelism - tp-size: 32 - dp-size: 32 - ep-size: 32 + tp-size: 48 + dp-size: 48 + ep-size: 48 enable-dp-attention: true # KV cache and attention @@ -155,6 +162,8 @@ backend: # CUDA graphs cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + benchmark: type: "sa-bench" diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml new file mode 100644 index 00000000..a24ea169 --- /dev/null +++ b/recipes/gb200-fp8/1k8k/low_latency.yaml @@ -0,0 +1,125 @@ +name: "gb200-fp8-1k8k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + 
prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + disaggregation-mode: "prefill" + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false + fp8-gemm-backend: "flashinfer_trtllm" + enable-symm-mem: true + moe-dense-tp-size: 1 + 
disaggregation-bootstrap-port: 30001 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + disaggregation-mode: "decode" + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false + enable-symm-mem: false #true + moe-dense-tp-size: 1 + disaggregation-bootstrap-port: 30001 + prefill-round-robin-balance: true + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "4x8" + req_rate: "inf" diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml new file mode 100644 index 00000000..904acf89 --- /dev/null +++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml @@ -0,0 +1,175 @@ +name: "gb200-fp8-1k8k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: 
"100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: 
"dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1024x2048x4096x6144" + req_rate: "inf" + diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml new file mode 100644 index 00000000..5c894a80 --- /dev/null +++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml @@ -0,0 +1,174 @@ +name: "gb200-fp8-1k8k-mid-curve" + 
+dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + decode_nodes: 12 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: 
"fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 
64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1024x2048x4096x6144" + req_rate: "inf" + diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml index 52ea1d89..83a12e2e 100644 --- a/recipes/gb200-fp8/8k1k/low-latency.yaml +++ b/recipes/gb200-fp8/8k1k/low-latency.yaml @@ -1,14 +1,23 @@ -name: "gb200-fp8-8k1k-1p-1d-low-latency" +name: "gb200-fp8-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: gpu_type: "gb200" - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 gpus_per_node: 4 @@ -18,9 +27,8 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" @@ -36,14 +44,12 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" @@ -64,18 +70,20 @@ backend: watchdog-timeout: 1000000 context-length: 9600 disaggregation-mode: "prefill" - mem-fraction-static: 0.95 + mem-fraction-static: 0.8 max-total-tokens: 32768 chunked-prefill-size: 24576 cuda-graph-max-bs: 512 max-running-requests: 512 load-balance-method: "round_robin" scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true moe-dense-tp-size: 1 - tensor-parallel-size: 4 + tensor-parallel-size: 8 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl decode: served-model-name: "deepseek-ai/DeepSeek-R1" @@ -88,22 +96,24 @@ backend: watchdog-timeout: 1000000 context-length: 9600 disaggregation-mode: "decode" - mem-fraction-static: 0.95 + mem-fraction-static: 0.8 chunked-prefill-size: 8192 cuda-graph-max-bs: 512 max-running-requests: 512 scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true enable-symm-mem: true moe-dense-tp-size: 1 prefill-round-robin-balance: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x16x32" + concurrencies: "4x8x16" req_rate: "inf" diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml new file mode 100644 index 00000000..d78dafb2 --- /dev/null +++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -0,0 +1,171 @@ +name: "gb200-8k1k-fp8-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + 
enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + 
disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 
208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512] + cuda-graph-max-bs: 512 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096x6144" + req_rate: "300" diff --git a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml similarity index 91% rename from recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml rename to recipes/gb200-fp8/8k1k/mid-curve.yaml index 4c6fff6d..98674cc6 100644 --- a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml +++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -1,10 +1,17 @@ -# GB200 FP8 Mid curve Configuration - name: "gb200-8k1k-fp8-mid-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -20,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256" MC_TE_METRIC: "true" @@ -45,7 +52,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" 
NCCL_MNNVL_ENABLE: "1" @@ -81,7 +87,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - disaggregation-bootstrap-port: 30001 # Prefill-specific mode disaggregation-mode: "prefill" @@ -105,6 +110,8 @@ backend: enable-dp-lm-head: true ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl decode: # Model configuration @@ -133,7 +140,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - disaggregation-bootstrap-port: 30001 # Decode-specific mode disaggregation-mode: "decode" @@ -153,6 +159,8 @@ backend: deepep-config: "/configs/deepep_config.json" # CUDA graphs cuda-graph-max-bs: 256 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench"