diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml new file mode 100644 index 00000000..933e4a14 --- /dev/null +++ b/recipies/gb200-fp4/1k8k/low-latency.yaml @@ -0,0 +1,111 @@ +name: "gb200-fp4-1p2d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:dev-cu13" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + 
model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + disable-cuda-graph: true + max-running-requests: 512 + scheduler-recv-interval: 10 + moe-dense-tp-size: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "4x8x32x64x112" + req_rate: "inf" diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml new file mode 100644 index 00000000..8a8fed77 --- /dev/null +++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml @@ -0,0 +1,177 @@ +# 4P1D, with 12 Decode Nodes. 
Uses single batch overlap + +name: "gb200-fp4-max-tpt" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:dev-cu13" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" 
+ attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9200 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9200 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive 
batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x128x512x2048x4096x8192" + req_rate: "inf" diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml new file mode 100644 index 00000000..c461fa54 --- /dev/null +++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml @@ -0,0 +1,177 @@ +# 4P1D, with 8 Decode Nodes. Decode does not use single batch overlap (prefill still enables it), but this setup currently allows us to drive higher +# per-GPU throughput + +name: "gb200-fp4-max-tpt-2" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:dev-cu13" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # 
Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9200 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: 
"deepseek-ai/DeepSeek-R1" + model-path: "/model/" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9200 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1x128x512x2048x4096x8192" + req_rate: "inf"