diff --git a/recipes/gb300-fp4/1k1k/low_latency.yaml b/recipes/gb300-fp4/1k1k/low_latency.yaml new file mode 100644 index 00000000..3ec9dc7d --- /dev/null +++ b/recipes/gb300-fp4/1k1k/low_latency.yaml @@ -0,0 +1,118 @@ +name: "gb300-fp4-low-latency-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + 
attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + moe-dense-tp-size: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k1k/max_tpt.yaml b/recipes/gb300-fp4/1k1k/max_tpt.yaml new file mode 100644 index 00000000..e762725c --- /dev/null +++ b/recipes/gb300-fp4/1k1k/max_tpt.yaml @@ -0,0 +1,184 @@ +name: "gb300-fp4-max-tpt-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 12 + 
prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + 
disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 
512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k1k/mid_curve.yaml b/recipes/gb300-fp4/1k1k/mid_curve.yaml new file mode 100644 index 00000000..e7a0abba --- /dev/null +++ b/recipes/gb300-fp4/1k1k/mid_curve.yaml @@ -0,0 +1,182 @@ +name: "gb300-fp4-mid-curve-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + 
SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + 
+ # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192x12000x15000" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k8k/low-latency.yaml b/recipes/gb300-fp4/1k8k/low-latency.yaml new file mode 100644 index 00000000..20290618 --- /dev/null +++ b/recipes/gb300-fp4/1k8k/low-latency.yaml @@ -0,0 +1,115 @@ +name: "gb300-fp4-1k8k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + 
precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "prefill" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + disable-cuda-graph: true + max-running-requests: 512 + scheduler-recv-interval: 10 + moe-dense-tp-size: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + 
expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "decode" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "4x8x16x32" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k8k/max-tpt.yaml b/recipes/gb300-fp4/1k8k/max-tpt.yaml new file mode 100644 index 00000000..2af9ce6f --- /dev/null +++ b/recipes/gb300-fp4/1k8k/max-tpt.yaml @@ -0,0 +1,248 @@ +name: "gb300-fp4-1k8k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + 
NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 10000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: 
true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 10000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: + [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 40, + 48, + 56, + 64, + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 128, + 136, + 144, + 152, + 160, + 168, + 176, + 184, + 192, + 200, + 208, + 216, + 224, + 232, + 240, + 248, + 256, + 264, + 272, + 280, + 288, + 296, + 304, + 312, + 320, + 328, + 336, + 344, + 352, + 360, + 368, + 376, + 384, + 416, + 448, + 480, + 512, + 544, + 576, + 608, + 640, + 672, + 704, + 736, + 768, + 1024, + ] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 
1024 + osl: 8192 + concurrencies: "256x512x1024x2048" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k8k/mid-curve.yaml b/recipes/gb300-fp4/1k8k/mid-curve.yaml new file mode 100644 index 00000000..f4883606 --- /dev/null +++ b/recipes/gb300-fp4/1k8k/mid-curve.yaml @@ -0,0 +1,247 @@ +name: "gb300-fp4-1k8k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 10000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 10000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory 
and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: + [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 40, + 48, + 56, + 64, + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 128, + 136, + 144, + 152, + 160, + 168, + 176, + 184, + 192, + 200, + 208, + 216, + 224, + 232, + 240, + 248, + 256, + 264, + 272, + 280, + 288, + 296, + 304, + 312, + 320, + 328, + 336, + 344, + 352, + 360, + 368, + 376, + 384, + 416, + 448, + 480, + 512, + 544, + 576, + 608, + 640, + 672, + 704, + 736, + 768, + 1024, + ] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Only for 0.5.8 + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "2048x4096x8192" + req_rate: "inf" diff --git a/recipes/gb300-fp4/8k1k/low_latency.yaml b/recipes/gb300-fp4/8k1k/low_latency.yaml new file mode 100644 index 00000000..f0c07e9d --- /dev/null +++ b/recipes/gb300-fp4/8k1k/low_latency.yaml @@ -0,0 +1,121 @@ +name: "gb300-fp4-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + 
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + moe-dense-tp-size: 1 + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + 
trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + scheduler-recv-interval: 10 + enable-symm-mem: true + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x32x64" + req_rate: 300 diff --git a/recipes/gb300-fp4/8k1k/max_tpt.yaml b/recipes/gb300-fp4/8k1k/max_tpt.yaml new file mode 100644 index 00000000..7d57ab7b --- /dev/null +++ b/recipes/gb300-fp4/8k1k/max_tpt.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + 
# Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): mixed-case "True" vs "true" used elsewhere — confirm consumer parses case-insensitively + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600  # covers isl 8192 + osl 1024 with margin + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4  # 10 prefill workers x tp4 = prefill_nodes (10) x gpus_per_node (4) + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention +
kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83  # NOTE(review): below prefill's 0.95 — presumably headroom for DeepEP/CUDA-graph buffers; confirm + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32  # single decode worker spans decode_nodes (8) x gpus_per_node (4) + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: 700 diff --git a/recipes/gb300-fp4/8k1k/mid_curve.yaml b/recipes/gb300-fp4/8k1k/mid_curve.yaml new file mode 100644 index 00000000..8bd3922f --- /dev/null +++ b/recipes/gb300-fp4/8k1k/mid_curve.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + decode_nodes: 12 + prefill_workers: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: +
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token
limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4  # 6 prefill workers x tp4 = prefill_nodes (6) x gpus_per_node (4) + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 48  # single decode worker spans decode_nodes (12) x gpus_per_node (4) + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x2048x4096" + req_rate: 700