# Patch adding three new recipe files under recipes/gb200-fp8/1k8k/
# (low-latency.yaml, max-tpt.yaml, mid-curve.yaml); every benchmark section uses isl 1024 / osl 8192.
# Comments inside the hunks are kept inline so each hunk's line count still matches its @@ header.
diff --git a/recipes/gb200-fp8/1k8k/low-latency.yaml b/recipes/gb200-fp8/1k8k/low-latency.yaml
new file mode 100644
index 00000000..52355f55
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/low-latency.yaml
@@ -0,0 +1,120 @@
+name: "gb200-fp8-1k8k-low-latency"  # recipe id; mirrors the path recipes/gb200-fp8/1k8k/low-latency.yaml
+
+dynamo:
+  version: 0.7.0  # unquoted, but two dots mean YAML resolves it as a string, not a float
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2  # the max-tpt and mid-curve recipes in this patch use 9
+
+model:
+  path: "dsfp8"  # presumably an alias resolved by the recipe runner rather than a filesystem path - confirm
+  container: "lmsysorg/sglang:v0.5.5.post2"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 2  # 2 nodes x 4 gpus_per_node = 8 GPUs
+  decode_nodes: 8  # 8 nodes x 4 gpus_per_node = 32 GPUs
+  prefill_workers: 1  # 8 GPUs / 1 worker = 8, the prefill tensor-parallel-size below (assumes an even split - confirm)
+  decode_workers: 4  # 32 GPUs / 4 workers = 8, the decode tensor-parallel-size below (assumes an even split - confirm)
+  gpus_per_node: 4
+
+backend:
+  prefill_environment:  # every value is a quoted string; NOTE(review): boolean-like values are spelled "1", "0", "true", "True" and "false" - confirm each consumer accepts its spelling
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"  # set only in this recipe; max-tpt and mid-curve do not define it
+    SGLANG_ENABLE_FLASHINFER_GEMM: "1"  # set only in this recipe; max-tpt and mid-curve do not define it
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"  # NOTE(review): "INBALANCE" is the spelling used here; presumably it must match the name the runtime reads - confirm before renaming
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # same entries as prefill_environment plus SGLANG_DECODE_BOOTSTRAP_TIMEOUT
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_ENABLE_FLASHINFER_GEMM: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"  # decode-only; absent from prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  sglang_config:  # presumably each key below is passed to the SGLang server as a same-named flag - confirm against the recipe runner
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 10000  # benchmark isl + osl = 1024 + 8192 = 9216, below this value
+      disaggregation-mode: "prefill"
+      mem-fraction-static: 0.95
+      max-total-tokens: 8192  # prefill only; the decode section sets no max-total-tokens
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 512
+      load-balance-method: "round_robin"
+      scheduler-recv-interval: 10
+      enable-symm-mem: true
+      moe-dense-tp-size: 1
+      disaggregation-bootstrap-port: 30001  # same port as the decode section
+      tensor-parallel-size: 8  # long-form key; max-tpt and mid-curve use tp-size / dp-size / ep-size instead
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 10000  # same limit as prefill
+      disaggregation-mode: "decode"
+      mem-fraction-static: 0.95
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 128  # equals cuda-graph-max-bs and the largest benchmark concurrency below
+      scheduler-recv-interval: 10
+      enable-flashinfer-allreduce-fusion: false  # explicitly off; the prefill section does not set this key
+      enable-symm-mem: true
+      moe-dense-tp-size: 1
+      prefill-round-robin-balance: true
+      disaggregation-bootstrap-port: 30001  # same port as the prefill section
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "4x8x32x64x80x96x112x128"  # one string with "x"-separated values; presumably split by the benchmark runner - confirm
+  req_rate: "inf"  # quoted string, not the YAML float .inf
diff --git a/recipes/gb200-fp8/1k8k/max-tpt.yaml b/recipes/gb200-fp8/1k8k/max-tpt.yaml
new file mode 100644
index 00000000..dd3c4868
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/max-tpt.yaml
@@ -0,0 +1,170 @@
+name: "gb200-fp8-1k8k-max-tpt"  # recipe id; mirrors the path recipes/gb200-fp8/1k8k/max-tpt.yaml
+
+dynamo:
+  version: 0.7.0  # unquoted, but two dots mean YAML resolves it as a string, not a float
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
+model:
+  path: "dsfp8"  # presumably an alias resolved by the recipe runner rather than a filesystem path - confirm
+  container: "lmsysorg/sglang:v0.5.5.post2"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 4  # 4 nodes x 4 gpus_per_node = 16 GPUs
+  prefill_workers: 2  # 16 GPUs / 2 workers = 8, the prefill tp-size below (assumes an even split - confirm)
+  decode_nodes: 8  # 8 nodes x 4 gpus_per_node = 32 GPUs
+  decode_workers: 1  # 32 GPUs / 1 worker = 32, the decode tp-size below
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables (every value is a quoted string)
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"  # NOTE(review): "INBALANCE" is the spelling used here; presumably it must match the name the runtime reads - confirm before renaming
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables (prefill set plus the DeepEP dispatch cap and the decode bootstrap timeout)
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"  # same number as the decode cuda-graph-max-bs below
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"  # decode-only; absent from prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:  # presumably each key below is passed to the SGLang server as a same-named flag - confirm against the recipe runner
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism (tp, dp and ep sizes are all 8 for prefill)
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 10000  # benchmark isl + osl = 1024 + 8192 = 9216, below this value
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001  # same port as the decode section
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288  # prefill only; the decode section sets no max-total-tokens
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # Performance optimizations
+      disable-cuda-graph: true  # prefill only; the decode section lists explicit cuda-graph-bs sizes instead
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"  # the decode section uses "low_latency"
+      ep-dispatch-algorithm: "dynamic"  # the decode section uses "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism (tp, dp and ep sizes are all 32 for decode)
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 10000  # same limit as prefill
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001  # same port as the prefill section
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"  # the prefill section uses "normal"
+      ep-dispatch-algorithm: "static"  # the prefill section uses "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graphs
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]  # 1, 2, 4, then steps of 8 from 8 to 384, then steps of 32 up to 768
+      cuda-graph-max-bs: 768  # equals the last cuda-graph-bs entry
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "1024x2048x4096"  # one string with "x"-separated values; presumably split by the benchmark runner - confirm
+  req_rate: "inf"  # quoted string, not the YAML float .inf
+
diff --git a/recipes/gb200-fp8/1k8k/mid-curve.yaml b/recipes/gb200-fp8/1k8k/mid-curve.yaml
new file mode 100644
index 00000000..f33e7387
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/mid-curve.yaml
@@ -0,0 +1,171 @@
+name: "gb200-fp8-1k8k-mid-curve"  # recipe id; mirrors the path recipes/gb200-fp8/1k8k/mid-curve.yaml
+
+dynamo:
+  version: 0.7.0  # unquoted, but two dots mean YAML resolves it as a string, not a float
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
+model:
+  path: "dsfp8"  # presumably an alias resolved by the recipe runner rather than a filesystem path - confirm
+  container: "lmsysorg/sglang:v0.5.5.post2"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 6  # 6 nodes x 4 gpus_per_node = 24 GPUs
+  prefill_workers: 3  # 24 GPUs / 3 workers = 8, the prefill tp-size below (assumes an even split - confirm)
+  decode_nodes: 12  # 12 nodes x 4 gpus_per_node = 48 GPUs
+  decode_workers: 1  # 48 GPUs / 1 worker = 48, the decode tp-size below
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables (every value is a quoted string)
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"  # NOTE(review): "INBALANCE" is the spelling used here; presumably it must match the name the runtime reads - confirm before renaming
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables (prefill set plus the DeepEP dispatch cap and the decode bootstrap timeout)
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"  # same number as the decode cuda-graph-max-bs below
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"  # decode-only; absent from prefill_environment
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:  # presumably each key below is passed to the SGLang server as a same-named flag - confirm against the recipe runner
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism (tp, dp and ep sizes are all 8 for prefill)
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 10000  # benchmark isl + osl = 1024 + 8192 = 9216, below this value
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001  # same port as the decode section
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288  # prefill only; the decode section sets no max-total-tokens
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # Performance optimizations
+      disable-cuda-graph: true  # prefill only; the decode section lists explicit cuda-graph-bs sizes instead
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"  # the decode section uses "low_latency"
+      ep-dispatch-algorithm: "dynamic"  # the decode section uses "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism (tp, dp and ep sizes are all 48 for decode)
+      tp-size: 48
+      dp-size: 48
+      ep-size: 48
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 10000  # same limit as prefill
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001  # same port as the prefill section
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"  # the prefill section uses "normal"
+      ep-dispatch-algorithm: "static"  # the prefill section uses "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graphs
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]  # 1, 2, 4, then steps of 8 from 8 to 384, then steps of 32 up to 768
+      cuda-graph-max-bs: 768  # equals the last cuda-graph-bs entry
+
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "1024x2048x4096"  # NOTE(review): identical to the max-tpt recipe's list - confirm this is intended for the mid-curve point
+  req_rate: "inf"  # quoted string, not the YAML float .inf
+