diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml index 119be5ca..6c2a9536 100644 --- a/recipies/gb200-fp4/1k8k/low-latency.yaml +++ b/recipies/gb200-fp4/1k8k/low-latency.yaml @@ -1,8 +1,16 @@ name: "gb200-fp4-1p2d" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -24,8 +32,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -43,8 +49,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -64,7 +68,7 @@ backend: moe-runner-backend: "flashinfer_trtllm" stream-interval: 10 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 mem-fraction-static: 0.95 max-total-tokens: 8192 chunked-prefill-size: 8192 @@ -77,7 +81,6 @@ backend: data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 - disaggregation-transfer-backend: nixl decode: disaggregation-mode: "decode" @@ -92,7 +95,7 @@ backend: disaggregation-bootstrap-port: 30001 stream-interval: 10 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 mem-fraction-static: 0.95 chunked-prefill-size: 8192 cuda-graph-max-bs: 256 @@ -100,7 +103,6 @@ backend: moe-dense-tp-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 - disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml index f6ad6141..d2c46140 100644 --- a/recipies/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml @@ -1,10 +1,16 @@ -# 4P1D, with 12 Decode Nodes. Uses single batch overlap - name: "gb200-fp4-max-tpt" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -56,13 +62,13 @@ backend: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true - disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -80,7 +86,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -112,7 +118,6 @@ backend: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true - disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -130,7 +135,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -228,7 +233,6 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true - fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml index bd5f8a23..bf455b72 100644 --- a/recipies/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml @@ -1,11 +1,16 @@ -# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher -# per gpu throughput +name: "gb200-fp4-mid-curve" -name: "gb200-fp4-max-tpt-2" +dynamo: + version: 0.7.0 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsr1" - container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030" + container: "lmsysorg/sglang:v0.5.5.post2" precision: "fp4" resources: @@ -57,6 +62,7 @@ backend: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -67,7 +73,6 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" - disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -81,7 +86,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -117,7 +122,6 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" - disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -131,7 +135,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 watchdog-timeout: 1000000 - context-length: 9200 + context-length: 10000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" disaggregation-bootstrap-port: 30001 @@ -228,7 +232,6 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true - fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32