From c02d59d22afe402b9613723c8dd1e00fc0763f0a Mon Sep 17 00:00:00 2001 From: Grace Ho Date: Thu, 29 Jan 2026 12:04:17 -0800 Subject: [PATCH 1/2] configs for gb300-fp8-no-mtp --- recipies/gb300-fp8/1k1k/stp/low-latency.yaml | 124 ++++++++++++++ recipies/gb300-fp8/1k1k/stp/max.yaml | 170 +++++++++++++++++++ recipies/gb300-fp8/1k1k/stp/mid.yaml | 169 ++++++++++++++++++ recipies/gb300-fp8/8k1k/stp/low-latency.yaml | 123 ++++++++++++++ recipies/gb300-fp8/8k1k/stp/max.yaml | 170 +++++++++++++++++++ recipies/gb300-fp8/8k1k/stp/mid.yaml | 170 +++++++++++++++++++ 6 files changed, 926 insertions(+) create mode 100644 recipies/gb300-fp8/1k1k/stp/low-latency.yaml create mode 100644 recipies/gb300-fp8/1k1k/stp/max.yaml create mode 100644 recipies/gb300-fp8/1k1k/stp/mid.yaml create mode 100644 recipies/gb300-fp8/8k1k/stp/low-latency.yaml create mode 100644 recipies/gb300-fp8/8k1k/stp/max.yaml create mode 100644 recipies/gb300-fp8/8k1k/stp/mid.yaml diff --git a/recipies/gb300-fp8/1k1k/stp/low-latency.yaml b/recipies/gb300-fp8/1k1k/stp/low-latency.yaml new file mode 100644 index 00000000..e67bc736 --- /dev/null +++ b/recipies/gb300-fp8/1k1k/stp/low-latency.yaml @@ -0,0 +1,124 @@ +name: "gb300-1k1k-fp8-low-latency" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + max-prefill-tokens: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + moe-dense-tp-size: 1 + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4,8,16,32] + req_rate: "inf" + diff --git a/recipies/gb300-fp8/1k1k/stp/max.yaml b/recipies/gb300-fp8/1k1k/stp/max.yaml new file mode 100644 index 00000000..2d7c65dd --- /dev/null +++ b/recipies/gb300-fp8/1k1k/stp/max.yaml @@ -0,0 +1,170 @@ +# GB200 FP8 Max Throughput Configuration + +name: "gb300-1k1k-fp8-max" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + cuda-graph-max-bs: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4096,7168,7680] + req_rate: "inf" + diff --git a/recipies/gb300-fp8/1k1k/stp/mid.yaml b/recipies/gb300-fp8/1k1k/stp/mid.yaml new file mode 100644 index 00000000..77df3500 --- /dev/null +++ b/recipies/gb300-fp8/1k1k/stp/mid.yaml @@ -0,0 +1,169 @@ +# GB200 FP8 Max Throughput Configuration +name: "gb300-1k1k-fp8-mid" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1024,2048,4096,6144] + req_rate: "inf" + diff --git a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml new file mode 100644 index 00000000..27300e8b --- /dev/null +++ b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml @@ -0,0 +1,123 @@ +name: "gb300-8k1k-fp8-low-latency" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + moe-dense-tp-size: 1 + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + moe-dense-tp-size: 1 + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 8102 + osl: 1024 + concurrencies: [4,8] + req_rate: "inf" diff --git a/recipies/gb300-fp8/8k1k/stp/max.yaml b/recipies/gb300-fp8/8k1k/stp/max.yaml new file mode 100644 index 00000000..ee0ae2f5 --- /dev/null +++ b/recipies/gb300-fp8/8k1k/stp/max.yaml @@ -0,0 +1,170 @@ +# GB200 FP8 Max Throughput Configuration + +name: "gb300-8k1k-fp8-max" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" +resources: + gpu_type: "gb300" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [2048,4096,10240] + req_rate: "inf" + diff --git a/recipies/gb300-fp8/8k1k/stp/mid.yaml b/recipies/gb300-fp8/8k1k/stp/mid.yaml new file mode 100644 index 00000000..9842c8aa --- /dev/null +++ b/recipies/gb300-fp8/8k1k/stp/mid.yaml @@ -0,0 +1,170 @@ +# GB200 FP8 Max Throughput Configuration + +name: "gb300-8k1k-fp8-mid" + +model: + path: "dsfp8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" +resources: + gpu_type: "gb300" + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128,256,512,1024,2048,4096] + req_rate: "inf" + From bb7c6171a6b1d5fdea00aed20b209aa1b3c33cce Mon Sep 17 00:00:00 2001 From: Grace Ho Date: Thu, 29 Jan 2026 13:08:18 -0800 Subject: [PATCH 2/2] typos from coderabbit addressed --- recipies/gb300-fp8/1k1k/stp/max.yaml | 2 +- recipies/gb300-fp8/1k1k/stp/mid.yaml | 2 +- recipies/gb300-fp8/8k1k/stp/low-latency.yaml | 2 +- recipies/gb300-fp8/8k1k/stp/max.yaml | 2 +- recipies/gb300-fp8/8k1k/stp/mid.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/recipies/gb300-fp8/1k1k/stp/max.yaml b/recipies/gb300-fp8/1k1k/stp/max.yaml index 2d7c65dd..664a9875 100644 --- a/recipies/gb300-fp8/1k1k/stp/max.yaml +++ b/recipies/gb300-fp8/1k1k/stp/max.yaml @@ -1,4 +1,4 @@ -# GB200 FP8 Max Throughput Configuration +# GB300 FP8 Max Throughput Configuration name: "gb300-1k1k-fp8-max" diff --git a/recipies/gb300-fp8/1k1k/stp/mid.yaml b/recipies/gb300-fp8/1k1k/stp/mid.yaml index 77df3500..5131c692 100644 --- a/recipies/gb300-fp8/1k1k/stp/mid.yaml +++ b/recipies/gb300-fp8/1k1k/stp/mid.yaml @@ -1,4 +1,4 @@ -# GB200 FP8 Max Throughput Configuration +# GB300 FP8 Mid Throughput Configuration name: "gb300-1k1k-fp8-mid" model: diff --git a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml index 27300e8b..07a7d781 100644 --- a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml +++ b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml @@ -117,7 +117,7 @@ backend: benchmark: type: "sa-bench" - isl: 8102 + isl: 8192 osl: 1024 concurrencies: [4,8] req_rate: "inf" diff --git a/recipies/gb300-fp8/8k1k/stp/max.yaml b/recipies/gb300-fp8/8k1k/stp/max.yaml index ee0ae2f5..010bed5e 100644 --- a/recipies/gb300-fp8/8k1k/stp/max.yaml +++ b/recipies/gb300-fp8/8k1k/stp/max.yaml @@ -1,4 +1,4 @@ -# GB200 FP8 Max Throughput Configuration +# GB300 FP8 Max Throughput Configuration name: "gb300-8k1k-fp8-max" diff --git a/recipies/gb300-fp8/8k1k/stp/mid.yaml b/recipies/gb300-fp8/8k1k/stp/mid.yaml index 9842c8aa..f7f16b54 100644 --- a/recipies/gb300-fp8/8k1k/stp/mid.yaml +++ b/recipies/gb300-fp8/8k1k/stp/mid.yaml @@ -1,4 +1,4 @@ -# GB200 FP8 Max Throughput Configuration +# GB300 FP8 Mid Throughput Configuration name: "gb300-8k1k-fp8-mid"