diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml index ab71d0a4..0f219d07 100644 --- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml +++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml @@ -1,7 +1,7 @@ name: "b200-fp4-low-latency-dep4-1p-tep8-5d" model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -135,5 +135,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "16x128x512" + concurrencies: "16x128" req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml index 2ac9fbeb..55347d69 100644 --- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml +++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml @@ -1,7 +1,7 @@ name: "b200-fp4-low-latency-dep4-1p-tep8-6d" model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -135,5 +135,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "32x64x256x512" + concurrencies: "32x64x256" req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml index 077bc0d9..7e617cb2 100644 --- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml +++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml @@ -1,7 +1,7 @@ name: "b200-fp4-max-tpt-dep4-1p-dep8-1d" model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -139,5 +139,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "512x1024" + concurrencies: "512" req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml index 890e35ed..51051ce4 100644 --- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml +++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml @@ -1,7 +1,7 @@ name: "b200-fp4-max-tpt-dep4-1p-dep8-2d" model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml index 12f9adab..03a930d5 100644 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml @@ -1,7 +1,15 @@ name: "b200-fp4-low-latency-dep4-1p-tep8-1d" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -89,6 +97,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" decode: # Model configuration @@ -126,6 +135,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" health_check: max_attempts: 360 diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml index 5cd343fe..ca4684d7 100644 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml @@ -1,7 +1,15 @@ name: "b200-fp4-low-latency-dep4-1p-tep8-5d" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -89,6 +97,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" decode: # Model configuration @@ -126,6 +135,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" health_check: max_attempts: 360 @@ -135,5 +145,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "32" + concurrencies: "8" req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml index 502cc023..450fbcba 100644 --- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml @@ -1,7 +1,15 @@ name: "b200-fp4-low-latency-dep4-2p-tep8-5d" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -89,6 +97,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" decode: # Model configuration @@ -126,6 +135,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" health_check: max_attempts: 360 @@ -135,5 +145,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "64x128" + concurrencies: "4x128" req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml similarity index 83% rename from recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml rename to recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml index 7331d670..f1e3c39f 100644 --- a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml +++ b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml @@ -1,14 +1,22 @@ -name: "b200-fp4-max-tpt-dep4-4p-dep8-1d" +name: "b200-fp4-low-latency-tp4-1p-tp8-1d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" resources: gpu_type: "b200" - prefill_nodes: 4 - prefill_workers: 4 + prefill_nodes: 1 + prefill_workers: 1 gpus_per_prefill: 4 decode_nodes: 1 decode_workers: 1 @@ -48,8 +56,6 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" - SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" DYN_REQUEST_PLANE: nats sglang_config: @@ -68,15 +74,15 @@ backend: max-prefill-tokens: 32768 chunked-prefill-size: 32768 context-length: 9600 - max-running-requests: 1024 + max-running-requests: 512 disable-cuda-graph: true # Parallelism tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true + data-parallel-size: 1 + expert-parallel-size: 1 +# enable-dp-attention: false +# enable-dp-lm-head: true # Attention attention-backend: "trtllm_mla" @@ -91,6 +97,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" decode: # Model configuration @@ -107,15 +114,13 @@ backend: max-prefill-tokens: 32768 chunked-prefill-size: 32768 context-length: 9600 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 + max-running-requests: 512 + cuda-graph-max-bs: 512 # Parallelism tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - enable-dp-lm-head: true + data-parallel-size: 1 + expert-parallel-size: 1 # Attention attention-backend: "trtllm_mla" @@ -123,13 +128,14 @@ backend: # MoE moe-runner-backend: "flashinfer_trtllm" - moe-dense-tp-size: 1 + # moe-dense-tp-size: 1 # Other flags stream-interval: 30 watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" health_check: max_attempts: 360 @@ -139,5 +145,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1024" + concurrencies: "4x8x16x64" req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml index 0d645ec0..a9f0d01e 100644 --- a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml +++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml @@ -1,7 +1,15 @@ name: "b200-fp4-max-tpt-dep4-7p-dep8-2d" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + model: - path: "dsr1-fp4" + path: "dsr1" container: "dynamo-sglang" precision: "fp4" @@ -49,7 +57,6 @@ backend: NCCL_CUMEM_ENABLE: "1" SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" DYN_REQUEST_PLANE: nats sglang_config: @@ -65,8 +72,8 @@ backend: # Memory and token limits mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 context-length: 9600 max-running-requests: 1024 disable-cuda-graph: true @@ -91,6 +98,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_cutlass" decode: # Model configuration @@ -107,7 +115,7 @@ backend: max-prefill-tokens: 32768 chunked-prefill-size: 32768 context-length: 9600 - max-running-requests: 1024 + max-running-requests: 2048 cuda-graph-max-bs: 1024 # Parallelism @@ -130,6 +138,7 @@ backend: watchdog-timeout: 1000000 enable-flashinfer-allreduce-fusion: true disable-radix-cache: true + fp4-gemm-backend: "flashinfer_cutlass" health_check: max_attempts: 360 @@ -139,5 +148,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1024" + concurrencies: "1024x2048" req_rate: "inf"