diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml new file mode 100644 index 00000000..15c552f0 --- /dev/null +++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml @@ -0,0 +1,139 @@ +name: "b200-fp4-low-latency-dep4-1p-tep8-5d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16x128x512" + req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml new file mode 100644 index 00000000..d70696ed --- /dev/null +++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml @@ -0,0 +1,139 @@ +name: "b200-fp4-low-latency-dep4-1p-tep8-6d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x256x512" + req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml new file mode 100644 index 00000000..6be9fbf8 --- /dev/null +++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-max-tpt-dep4-1p-dep8-1d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml new file mode 100644 index 00000000..d3d1ad0f --- /dev/null +++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-max-tpt-dep4-1p-dep8-2d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml new file mode 100644 index 00000000..a0c39932 --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml @@ -0,0 +1,149 @@ +name: "b200-fp4-low-latency-dep4-1p-tep8-1d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml new file mode 100644 index 00000000..f0ee8950 --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml @@ -0,0 +1,149 @@ +name: "b200-fp4-low-latency-dep4-1p-tep8-5d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml new file mode 100644 index 00000000..d085f621 --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml @@ -0,0 +1,149 @@ +name: "b200-fp4-low-latency-dep4-2p-tep8-5d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x128" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml new file mode 100644 index 00000000..74a0b8b8 --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml @@ -0,0 +1,149 @@ +name: "b200-fp4-low-latency-tp4-1p-tp8-1d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 +# enable-dp-attention: false +# enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_trtllm" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x64" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml new file mode 100644 index 00000000..b61f50b1 --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml @@ -0,0 +1,143 @@ +name: "b200-fp4-max-tpt-dep4-4p-dep8-1d" + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml new file mode 100644 index 00000000..04373acf --- /dev/null +++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml @@ -0,0 +1,152 @@ +name: "b200-fp4-max-tpt-dep4-7p-dep8-2d" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +model: + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_MOE_NVFP4_DISPATCH: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + context-length: 9600 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_cutlass" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 2048 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + fp4-gemm-backend: "flashinfer_cutlass" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" + req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml new file mode 100644 index 00000000..892a8cab --- /dev/null +++ b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml @@ -0,0 +1,134 @@ +name: "b200-fp8-low-latency-tep8-1p-1d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml new file mode 100644 index 00000000..ec0a691c --- /dev/null +++ b/recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml @@ -0,0 +1,134 @@ +name: "b200-fp8-low-latency-tep8-1p-3d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16x32x64x128x256" + req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml new file mode 100644 index 00000000..852990c2 --- /dev/null +++ b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml @@ -0,0 +1,138 @@ +name: "b200-fp8-max-tpt-dep8-1p-5d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" diff --git a/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml new file mode 100644 index 00000000..27150ce1 --- /dev/null +++ b/recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml @@ -0,0 +1,138 @@ +name: "b200-fp8-max-tpt-dep8-2p-5d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml new file mode 100644 index 00000000..6cedfa72 --- /dev/null +++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p1d.yaml @@ -0,0 +1,134 @@ +name: "b200-fp8-low-latency-tep8-1p-1d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x32x64" + req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml new file mode 100644 index 00000000..6eac575e --- /dev/null +++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p4d.yaml @@ -0,0 +1,134 @@ +name: "b200-fp8-low-latency-tep8-1p-4d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64" + req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml new file mode 100644 index 00000000..69804809 --- /dev/null +++ b/recipes/b200-fp8/8k1k/stp/low-latency-tep8-1p6d.yaml @@ -0,0 +1,134 @@ +name: "b200-fp8-low-latency-tep8-1p-6d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml new file mode 100644 index 00000000..3f3fcf6f --- /dev/null +++ b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-1p1d.yaml @@ -0,0 +1,138 @@ +name: "b200-fp8-max-tpt-dep8-1p-1d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml new file mode 100644 index 00000000..0549b9dd --- /dev/null +++ b/recipes/b200-fp8/8k1k/stp/max-tpt-dep8-2p1d.yaml @@ -0,0 +1,138 @@ +name: "b200-fp8-max-tpt-dep8-2p-1d" + +model: + path: "dsfp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf"