From 0c90137c10f7c36d72a8ebe1eec20e0c342e022f Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Mon, 2 Feb 2026 14:31:25 -0800 Subject: [PATCH 1/6] Update GB200-FP8 configs --- recipes/gb200-fp8/1k1k/low-latency.yaml | 38 ++-- .../1k1k/{max-tpt-2p1d.yaml => max-tpt.yaml} | 25 ++- .../{mid-curve-3p1d.yaml => mid-curve.yaml} | 28 ++- recipes/gb200-fp8/1k8k/low_latency.yaml | 124 ++++++++++++ recipes/gb200-fp8/1k8k/max_tpt.yaml | 182 ++++++++++++++++++ recipes/gb200-fp8/1k8k/mid_curve.yaml | 173 +++++++++++++++++ recipes/gb200-fp8/8k1k/low-latency.yaml | 41 ++-- recipes/gb200-fp8/8k1k/max_tpt.yaml | 174 +++++++++++++++++ .../{mid-curve-5p1d.yaml => mid-curve.yaml} | 23 ++- 9 files changed, 753 insertions(+), 55 deletions(-) rename recipes/gb200-fp8/1k1k/{max-tpt-2p1d.yaml => max-tpt.yaml} (90%) rename recipes/gb200-fp8/1k1k/{mid-curve-3p1d.yaml => mid-curve.yaml} (90%) create mode 100644 recipes/gb200-fp8/1k8k/low_latency.yaml create mode 100644 recipes/gb200-fp8/1k8k/max_tpt.yaml create mode 100644 recipes/gb200-fp8/1k8k/mid_curve.yaml create mode 100644 recipes/gb200-fp8/8k1k/max_tpt.yaml rename recipes/gb200-fp8/8k1k/{mid-curve-5p1d.yaml => mid-curve.yaml} (89%) diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml index 7ce9daf3..11151232 100644 --- a/recipes/gb200-fp8/1k1k/low-latency.yaml +++ b/recipes/gb200-fp8/1k1k/low-latency.yaml @@ -1,16 +1,24 @@ -name: "gb200-fp8-1p-4d-low-latency" +name: "gb200-fp8-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: gpu_type: "gb200" prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 1 prefill_workers: 1 - decode_workers: 4 + decode_workers: 1 gpus_per_node: 4 backend: @@ -18,9 +26,8 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" 
PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" @@ -36,14 +43,13 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" @@ -72,12 +78,14 @@ backend: max-running-requests: 512 load-balance-method: "round_robin" scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true + fp8-gemm-backend: "flashinfer_trtllm" enable-symm-mem: true moe-dense-tp-size: 1 tensor-parallel-size: 4 data-parallel-size: 1 expert-parallel-size: 1 + + disaggregation-transfer-backend: nixl decode: served-model-name: "deepseek-ai/DeepSeek-R1" @@ -94,19 +102,21 @@ backend: mem-fraction-static: 0.95 chunked-prefill-size: 8192 cuda-graph-max-bs: 128 - max-running-requests: 512 + max-running-requests: 128 scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true enable-symm-mem: true moe-dense-tp-size: 1 prefill-round-robin-balance: true tensor-parallel-size: 4 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x32x64x80x96x112x128" - 
req_rate: "inf" \ No newline at end of file + concurrencies: "4x8" + req_rate: "inf" diff --git a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml similarity index 90% rename from recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml rename to recipes/gb200-fp8/1k1k/max-tpt.yaml index e1859cec..14f43d2a 100644 --- a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml +++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -1,10 +1,16 @@ -# GB200 FP8 Max Throughput Configuration +name: "gb200-fp8-1k1k-max-tpt" -name: "gb200-fp8-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: @@ -20,7 +26,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +43,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" @@ -45,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -106,6 +111,8 @@ backend: ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + decode: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" @@ -156,10 +163,12 
@@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1024x2048x4096" + concurrencies: "1024x2048x4096x6144" req_rate: "inf" diff --git a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml similarity index 90% rename from recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml rename to recipes/gb200-fp8/1k1k/mid-curve.yaml index 36bbfb7e..5ea1a036 100644 --- a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml +++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -1,10 +1,16 @@ -# GB200 FP8 Max Throughput Configuration +name: "gb200-fp8-1k1k-mid-curve" -name: "gb200-fp8-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: @@ -20,7 +26,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +43,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" @@ -45,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: 
"100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -105,6 +110,7 @@ backend: enable-dp-lm-head: true ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl decode: # Model configuration @@ -113,9 +119,9 @@ backend: trust-remote-code: true # Parallelism - tp-size: 32 - dp-size: 32 - ep-size: 32 + tp-size: 48 + dp-size: 48 + ep-size: 48 enable-dp-attention: true # KV cache and attention @@ -155,6 +161,8 @@ backend: # CUDA graphs cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + benchmark: type: "sa-bench" diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml new file mode 100644 index 00000000..6d3e893c --- /dev/null +++ b/recipes/gb200-fp8/1k8k/low_latency.yaml @@ -0,0 +1,124 @@ +name: "gb200-fp8-1k8k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true # Enable nginx + multiple routers + num_additional_frontends: 2 # Additional routers (total = 1 + t + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + disaggregation-mode: "prefill" + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false + fp8-gemm-backend: "flashinfer_trtllm" + enable-symm-mem: true + moe-dense-tp-size: 1 + disaggregation-bootstrap-port: 30001 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + 
attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 10000 + disaggregation-mode: "decode" + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false + enable-symm-mem: false #true + moe-dense-tp-size: 1 + disaggregation-bootstrap-port: 30001 + prefill-round-robin-balance: true + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "4x8x32x64x80x96x112x128" + req_rate: "inf" diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml new file mode 100644 index 00000000..8322b8b9 --- /dev/null +++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml @@ -0,0 +1,182 @@ +name: "gb200-fp8-1k8k-max-tpt" + +extra_mount: # add this if you need to mount extra directories to the container + - "/lustre:/lustre" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true # Enable nginx + multiple routers + num_additional_frontends: 9 # Additional routers (total = 1 + this value) + +model: + path: "dsfp8" +# container: "sglang0p5p5ppost2" +# container: "sglang0p5p7" +# container: "sglang0p5p8" + container: "sglang0p5p8_cu13" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" +# SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" +# SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + 
# DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1024x2048x4096" + req_rate: "inf" + diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml new file mode 100644 index 00000000..19737dc4 --- /dev/null +++ 
b/recipes/gb200-fp8/1k8k/mid_curve.yaml @@ -0,0 +1,173 @@ +name: "gb200-fp8-1k8k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + decode_nodes: 12 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + 
enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 10000 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + 
deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 8192 + concurrencies: "1024x2048x4096" + req_rate: "inf" + diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml index 52ea1d89..56587e29 100644 --- a/recipes/gb200-fp8/8k1k/low-latency.yaml +++ b/recipes/gb200-fp8/8k1k/low-latency.yaml @@ -1,14 +1,22 @@ -name: "gb200-fp8-8k1k-1p-1d-low-latency" +name: "gb200-fp8-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true # Enable nginx + multiple routers + num_additional_frontends: 2 # Additional routers (total = 1 + t model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: gpu_type: "gb200" - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 gpus_per_node: 4 @@ -18,9 +26,8 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" @@ -36,14 +43,12 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: 
"/configs/dg-0.5.8_cu13" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" @@ -64,18 +69,20 @@ backend: watchdog-timeout: 1000000 context-length: 9600 disaggregation-mode: "prefill" - mem-fraction-static: 0.95 + mem-fraction-static: 0.8 max-total-tokens: 32768 chunked-prefill-size: 24576 cuda-graph-max-bs: 512 max-running-requests: 512 load-balance-method: "round_robin" scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true moe-dense-tp-size: 1 - tensor-parallel-size: 4 + tensor-parallel-size: 8 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl decode: served-model-name: "deepseek-ai/DeepSeek-R1" @@ -88,18 +95,20 @@ backend: watchdog-timeout: 1000000 context-length: 9600 disaggregation-mode: "decode" - mem-fraction-static: 0.95 + mem-fraction-static: 0.8 chunked-prefill-size: 8192 cuda-graph-max-bs: 512 max-running-requests: 512 scheduler-recv-interval: 10 - enable-flashinfer-allreduce-fusion: true enable-symm-mem: true moe-dense-tp-size: 1 prefill-round-robin-balance: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 data-parallel-size: 1 expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml new file mode 100644 index 00000000..0d90dddb --- /dev/null +++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -0,0 +1,174 @@ +name: "gb200-8k1k-fp8-max-tpt" + +dynamo: + 
version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + 
+ # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + 
moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512] + cuda-graph-max-bs: 512 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096x6144x8192x10240" + req_rate: "300" diff --git a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml similarity index 89% rename from recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml rename to recipes/gb200-fp8/8k1k/mid-curve.yaml index 4c6fff6d..24d12de1 100644 --- a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml +++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -1,10 +1,16 @@ -# GB200 FP8 Mid curve Configuration - name: "gb200-8k1k-fp8-mid-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + model: - path: "dsfp8" - container: "0.5.5.post2" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: @@ -20,7 +26,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -37,7 +43,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256" MC_TE_METRIC: "true" @@ -45,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -105,6 +110,8 @@ backend: enable-dp-lm-head: true ep-num-redundant-experts: 32 deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl decode: # Model configuration @@ -153,6 +160,8 @@ backend: deepep-config: "/configs/deepep_config.json" # CUDA graphs cuda-graph-max-bs: 256 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" From acb398b5dfc325677808cae17cc2fec3ba80262f Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Mon, 2 Feb 2026 14:44:51 -0800 Subject: [PATCH 2/6] Update GB200-FP4 configs --- recipes/gb200-fp4/1k1k/low-latency.yaml | 24 +++++++++++++++--------- recipes/gb200-fp4/1k1k/max-tpt.yaml | 22 ++++++++++++++-------- recipes/gb200-fp4/1k1k/mid-curve.yaml | 23 ++++++++++++++--------- recipes/gb200-fp4/1k8k/low-latency.yaml | 24 +++++++++++++----------- recipes/gb200-fp4/1k8k/max-tpt.yaml | 15 ++++++++------- recipes/gb200-fp4/1k8k/mid-curve.yaml | 20 ++++++++++---------- recipes/gb200-fp4/8k1k/low-latency.yaml | 23 +++++++++++++++-------- recipes/gb200-fp4/8k1k/max-tpt.yaml | 22 +++++++++++++++------- recipes/gb200-fp4/8k1k/mid-curve.yaml | 22 +++++++++++++++------- 9 files changed, 119 insertions(+), 76 deletions(-) diff --git a/recipes/gb200-fp4/1k1k/low-latency.yaml b/recipes/gb200-fp4/1k1k/low-latency.yaml index b27f67ec..bdb3cb6f 100644 --- a/recipes/gb200-fp4/1k1k/low-latency.yaml +++ b/recipes/gb200-fp4/1k1k/low-latency.yaml @@ -1,8 +1,16 @@ -name: "gb200-fp4-1p2d" +name: "gb200-fp4-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + 
enable_multiple_frontends: true + num_additional_frontends: 3 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,14 +33,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -44,14 +49,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: @@ -76,6 +78,8 @@ backend: moe-dense-tp-size: 1 load-balance-method: "round_robin" disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 @@ -100,6 +104,8 @@ backend: scheduler-recv-interval: 10 enable-symm-mem: true moe-dense-tp-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" tensor-parallel-size: 4 expert-parallel-size: 1 @@ -108,4 +114,4 @@ benchmark: isl: 1024 osl: 1024 concurrencies: "4x8x32x64x112x128x256" - req_rate: "inf" \ No newline at end of file + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/max-tpt.yaml b/recipes/gb200-fp4/1k1k/max-tpt.yaml index ba31ccfc..2cf26114 100644 --- a/recipes/gb200-fp4/1k1k/max-tpt.yaml +++ 
b/recipes/gb200-fp4/1k1k/max-tpt.yaml @@ -1,10 +1,16 @@ -# 4P1D, with 12 Decode Nodes. Uses single batch overlap +name: "gb200-fp4-1k1k-max-tpt" -name: "gb200-fp4-max-tpt" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -27,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -46,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -56,8 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -103,6 +105,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -162,6 +166,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 @@ -173,4 +179,4 @@ benchmark: isl: 1024 osl: 1024 concurrencies: "1x128x512x2048x4096x8192x12000x15000" - req_rate: "inf" \ No newline at end of file + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/mid-curve.yaml b/recipes/gb200-fp4/1k1k/mid-curve.yaml index 
2365f2c0..277cc2c4 100644 --- a/recipes/gb200-fp4/1k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k1k/mid-curve.yaml @@ -1,11 +1,16 @@ -# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher -# per gpu throughput +name: "gb200-fp4-1k1k-mid-curve" -name: "gb200-fp4-max-tpt-2" +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -28,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -47,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -57,8 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -104,6 +105,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -162,6 +165,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32 @@ -173,4 +178,4 @@ benchmark: isl: 1024 osl: 1024 concurrencies: "1x128x512x2048x4096x8192x12000x15000" - req_rate: "inf" \ No 
newline at end of file + req_rate: "inf" diff --git a/recipes/gb200-fp4/1k8k/low-latency.yaml b/recipes/gb200-fp4/1k8k/low-latency.yaml index 6c2a9536..10944923 100644 --- a/recipes/gb200-fp4/1k8k/low-latency.yaml +++ b/recipes/gb200-fp4/1k8k/low-latency.yaml @@ -1,16 +1,16 @@ -name: "gb200-fp4-1p2d" +name: "gb200-fp4-1k8k-low-latency" dynamo: - version: 0.7.0 + version: 0.8.1 -frontend: - type: dynamo +frontend: + type: dynamo enable_multiple_frontends: true - num_additional_frontends: 4 + num_additional_frontends: 3 model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -37,7 +37,6 @@ backend: NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -54,12 +53,11 @@ backend: NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: - disaggregation-mode: "prefill" served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "prefill" trust-remote-code: true disable-radix-cache: true kv-cache-dtype: "fp8_e4m3" @@ -81,10 +79,12 @@ backend: data-parallel-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl decode: - disaggregation-mode: "decode" served-model-name: "deepseek-ai/DeepSeek-R1" + disaggregation-mode: "decode" prefill-round-robin-balance: true trust-remote-code: true disable-radix-cache: true @@ -103,6 +103,8 @@ backend: moe-dense-tp-size: 1 tensor-parallel-size: 4 expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl benchmark: type: "sa-bench" diff --git a/recipes/gb200-fp4/1k8k/max-tpt.yaml b/recipes/gb200-fp4/1k8k/max-tpt.yaml index d2c46140..68bd1928 100644 
--- a/recipes/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipes/gb200-fp4/1k8k/max-tpt.yaml @@ -1,7 +1,7 @@ -name: "gb200-fp4-max-tpt" +name: "gb200-fp4-1k8k-max-tpt" dynamo: - version: 0.7.0 + version: 0.8.1 frontend: type: dynamo @@ -9,8 +9,8 @@ frontend: num_additional_frontends: 9 model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -32,7 +32,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -51,7 +50,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,7 +59,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: @@ -69,6 +66,7 @@ backend: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true + disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -108,6 +106,7 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 4 @@ -118,6 +117,7 @@ backend: # Model configuration served-model-name: "deepseek-ai/DeepSeek-R1" trust-remote-code: true + disaggregation-transfer-backend: nixl # KV cache and attention kv-cache-dtype: "fp8_e4m3" @@ -233,6 +233,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: 
true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 48 diff --git a/recipes/gb200-fp4/1k8k/mid-curve.yaml b/recipes/gb200-fp4/1k8k/mid-curve.yaml index bf455b72..c781fc7f 100644 --- a/recipes/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k8k/mid-curve.yaml @@ -1,16 +1,16 @@ -name: "gb200-fp4-mid-curve" +name: "gb200-fp4-1k8k-mid-curve" dynamo: - version: 0.7.0 + version: 0.8.1 -frontend: - type: dynamo +frontend: + type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 model: - path: "dsr1" - container: "lmsysorg/sglang:v0.5.5.post2" + path: "dsfp4" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -32,7 +32,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -51,7 +50,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,8 +59,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -73,6 +69,7 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -108,6 +105,7 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 4 @@ -122,6 +120,7 @@ backend: # KV cache and attention kv-cache-dtype: "fp8_e4m3" 
attention-backend: "trtllm_mla" + disaggregation-transfer-backend: nixl # Quantization quantization: "modelopt_fp4" @@ -232,6 +231,7 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" # Parallelism tp-size: 32 diff --git a/recipes/gb200-fp4/8k1k/low-latency.yaml b/recipes/gb200-fp4/8k1k/low-latency.yaml index 73a88588..1fab3df1 100644 --- a/recipes/gb200-fp4/8k1k/low-latency.yaml +++ b/recipes/gb200-fp4/8k1k/low-latency.yaml @@ -1,8 +1,16 @@ -name: "gb200-8k1k-fp4-low-latency-1p_tp4/4d_tp4" +name: "gb200-fp4-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,14 +33,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" @@ -44,14 +49,11 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" - # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" SGLANG_ENABLE_JIT_DEEPGEMM: "false" - SGLANG_ENABLE_FLASHINFER_GEMM: "true" sglang_config: prefill: @@ -77,6 +79,8 @@ backend: load-balance-method: "round_robin" disaggregation-bootstrap-port: 30001 data-parallel-size: 1 + disaggregation-transfer-backend: nixl + 
fp4-gemm-backend: "flashinfer_trtllm" tensor-parallel-size: 4 expert-parallel-size: 1 enable-dp-attention: false @@ -101,9 +105,12 @@ backend: scheduler-recv-interval: 10 enable-symm-mem: true moe-dense-tp-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" tensor-parallel-size: 4 expert-parallel-size: 1 enable-dp-attention: false + benchmark: type: "sa-bench" isl: 8192 diff --git a/recipes/gb200-fp4/8k1k/max-tpt.yaml b/recipes/gb200-fp4/8k1k/max-tpt.yaml index 26b4629d..b54813ab 100644 --- a/recipes/gb200-fp4/8k1k/max-tpt.yaml +++ b/recipes/gb200-fp4/8k1k/max-tpt.yaml @@ -1,8 +1,16 @@ -name: "gb200-8k1k-fp4-max-10p_tp4/1d_dep32" +name: "gb200-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -44,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -54,8 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -99,6 +103,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + 
disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -156,6 +162,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 32 @@ -166,5 +174,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1024x2048x8192" + concurrencies: "1024x2048" req_rate: 700 diff --git a/recipes/gb200-fp4/8k1k/mid-curve.yaml b/recipes/gb200-fp4/8k1k/mid-curve.yaml index f1e9bb41..95b33f54 100644 --- a/recipes/gb200-fp4/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/8k1k/mid-curve.yaml @@ -1,8 +1,16 @@ -name: "gb200-8k1k-fp4-mid-6p_tp4/1d_dep48" +name: "gb200-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 model: path: "dsfp4" - container: "0.5.5.post2" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp4" resources: @@ -25,7 +33,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -44,7 +51,6 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" MC_TE_METRIC: "true" MC_FORCE_MNNVL: "1" NCCL_MNNVL_ENABLE: "1" @@ -54,8 +60,6 @@ backend: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" SGLANG_MOE_NVFP4_DISPATCH: "1" - SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" sglang_config: prefill: @@ -99,6 +103,8 @@ backend: # Performance optimizations disable-cuda-graph: true enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + 
disaggregation-transfer-backend: nixl # Parallelism tp-size: 4 @@ -156,6 +162,8 @@ backend: enable-dp-lm-head: true prefill-round-robin-balance: true enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl # Parallelism tp-size: 48 @@ -167,4 +175,4 @@ benchmark: isl: 8192 osl: 1024 concurrencies: "512x1024x2048x4096" - req_rate: 700 \ No newline at end of file + req_rate: 700 From 6fa4eb6fb02dc3cc74de848cdf41b3d42df80519 Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Mon, 2 Feb 2026 16:31:14 -0800 Subject: [PATCH 3/6] Add nginx container to all GB200-FP8 configs --- recipes/gb200-fp8/1k1k/low-latency.yaml | 1 + recipes/gb200-fp8/1k1k/max-tpt.yaml | 1 + recipes/gb200-fp8/1k1k/mid-curve.yaml | 1 + recipes/gb200-fp8/1k8k/low_latency.yaml | 7 ++++--- recipes/gb200-fp8/1k8k/max_tpt.yaml | 17 ++++++----------- recipes/gb200-fp8/1k8k/mid_curve.yaml | 3 ++- recipes/gb200-fp8/8k1k/low-latency.yaml | 7 ++++--- recipes/gb200-fp8/8k1k/max_tpt.yaml | 3 ++- recipes/gb200-fp8/8k1k/mid-curve.yaml | 1 + 9 files changed, 22 insertions(+), 19 deletions(-) diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml index 11151232..92cb098d 100644 --- a/recipes/gb200-fp8/1k1k/low-latency.yaml +++ b/recipes/gb200-fp8/1k1k/low-latency.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 2 + nginx_container: nginx model: path: "dsr1-fp8" diff --git a/recipes/gb200-fp8/1k1k/max-tpt.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml index 14f43d2a..e20bfd84 100644 --- a/recipes/gb200-fp8/1k1k/max-tpt.yaml +++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsr1-fp8" diff --git a/recipes/gb200-fp8/1k1k/mid-curve.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml index 5ea1a036..95eae698 100644 --- a/recipes/gb200-fp8/1k1k/mid-curve.yaml +++ 
b/recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsr1-fp8" diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml index 6d3e893c..73eda59f 100644 --- a/recipes/gb200-fp8/1k8k/low_latency.yaml +++ b/recipes/gb200-fp8/1k8k/low_latency.yaml @@ -5,8 +5,9 @@ dynamo: frontend: type: dynamo - enable_multiple_frontends: true # Enable nginx + multiple routers - num_additional_frontends: 2 # Additional routers (total = 1 + t + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx model: path: "dsr1-fp8" @@ -120,5 +121,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 8192 - concurrencies: "4x8x32x64x80x96x112x128" + concurrencies: "4x8" req_rate: "inf" diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml index 8322b8b9..749b1bb8 100644 --- a/recipes/gb200-fp8/1k8k/max_tpt.yaml +++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml @@ -1,22 +1,17 @@ name: "gb200-fp8-1k1k-max-tpt" -extra_mount: # add this if you need to mount extra directories to the container - - "/lustre:/lustre" - dynamo: version: 0.8.1 frontend: type: dynamo - enable_multiple_frontends: true # Enable nginx + multiple routers - num_additional_frontends: 9 # Additional routers (total = 1 + t + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx model: - path: "dsfp8" -# container: "sglang0p5p5ppost2" -# container: "sglang0p5p7" -# container: "sglang0p5p8" - container: "sglang0p5p8_cu13" + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130-runtime" precision: "fp8" resources: @@ -177,6 +172,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 8192 - concurrencies: "1024x2048x4096" + concurrencies: "1024x2048x4096x6144" req_rate: "inf" diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml index 19737dc4..452b033b 
100644 --- a/recipes/gb200-fp8/1k8k/mid_curve.yaml +++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsr1-fp8" @@ -168,6 +169,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 8192 - concurrencies: "1024x2048x4096" + concurrencies: "1024x2048x4096x6144" req_rate: "inf" diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml index 56587e29..c73c7a8f 100644 --- a/recipes/gb200-fp8/8k1k/low-latency.yaml +++ b/recipes/gb200-fp8/8k1k/low-latency.yaml @@ -5,8 +5,9 @@ dynamo: frontend: type: dynamo - enable_multiple_frontends: true # Enable nginx + multiple routers - num_additional_frontends: 2 # Additional routers (total = 1 + t + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx model: path: "dsr1-fp8" @@ -114,5 +115,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x16x32" + concurrencies: "4x8x16" req_rate: "inf" diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml index 0d90dddb..b48de751 100644 --- a/recipes/gb200-fp8/8k1k/max_tpt.yaml +++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsr1-fp8" @@ -170,5 +171,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048x4096x6144x8192x10240" + concurrencies: "2048x4096x6144" req_rate: "300" diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml index 24d12de1..bd0fb9ef 100644 --- a/recipes/gb200-fp8/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsr1-fp8" From 55e80e9b41372fe192a32b5454d8ed3bdfd1e42f Mon Sep 17 00:00:00 2001 
From: Kyle Liang Date: Mon, 2 Feb 2026 16:45:48 -0800 Subject: [PATCH 4/6] Add nginx container to GB200-FP4 configs --- recipes/gb200-fp4/1k1k/low-latency.yaml | 3 ++- recipes/gb200-fp4/1k1k/max-tpt.yaml | 3 ++- recipes/gb200-fp4/1k1k/mid-curve.yaml | 3 ++- recipes/gb200-fp4/1k8k/low-latency.yaml | 1 + recipes/gb200-fp4/1k8k/max-tpt.yaml | 3 ++- recipes/gb200-fp4/1k8k/mid-curve.yaml | 1 + recipes/gb200-fp4/8k1k/low-latency.yaml | 3 ++- recipes/gb200-fp4/8k1k/max-tpt.yaml | 3 ++- recipes/gb200-fp4/8k1k/mid-curve.yaml | 3 ++- 9 files changed, 16 insertions(+), 7 deletions(-) diff --git a/recipes/gb200-fp4/1k1k/low-latency.yaml b/recipes/gb200-fp4/1k1k/low-latency.yaml index bdb3cb6f..c953d991 100644 --- a/recipes/gb200-fp4/1k1k/low-latency.yaml +++ b/recipes/gb200-fp4/1k1k/low-latency.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 3 + nginx_container: nginx model: path: "dsfp4" @@ -113,5 +114,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x32x64x112x128x256" + concurrencies: "4x8x32" req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/max-tpt.yaml b/recipes/gb200-fp4/1k1k/max-tpt.yaml index 2cf26114..4bfd6ccf 100644 --- a/recipes/gb200-fp4/1k1k/max-tpt.yaml +++ b/recipes/gb200-fp4/1k1k/max-tpt.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" @@ -178,5 +179,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x128x512x2048x4096x8192x12000x15000" + concurrencies: "512x2048x4096" req_rate: "inf" diff --git a/recipes/gb200-fp4/1k1k/mid-curve.yaml b/recipes/gb200-fp4/1k1k/mid-curve.yaml index 277cc2c4..d87d9a5a 100644 --- a/recipes/gb200-fp4/1k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k1k/mid-curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" @@ -177,5 +178,5 @@ 
benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x128x512x2048x4096x8192x12000x15000" + concurrencies: "512x2048x4096x8192x15000" req_rate: "inf" diff --git a/recipes/gb200-fp4/1k8k/low-latency.yaml b/recipes/gb200-fp4/1k8k/low-latency.yaml index 10944923..1153983f 100644 --- a/recipes/gb200-fp4/1k8k/low-latency.yaml +++ b/recipes/gb200-fp4/1k8k/low-latency.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 3 + nginx_container: nginx model: path: "dsfp4" diff --git a/recipes/gb200-fp4/1k8k/max-tpt.yaml b/recipes/gb200-fp4/1k8k/max-tpt.yaml index 68bd1928..8d75c7be 100644 --- a/recipes/gb200-fp4/1k8k/max-tpt.yaml +++ b/recipes/gb200-fp4/1k8k/max-tpt.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" @@ -244,5 +245,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 8192 - concurrencies: "256x512x1024x2048x8192" + concurrencies: "256x512x1024x2048" req_rate: "inf" diff --git a/recipes/gb200-fp4/1k8k/mid-curve.yaml b/recipes/gb200-fp4/1k8k/mid-curve.yaml index c781fc7f..01141454 100644 --- a/recipes/gb200-fp4/1k8k/mid-curve.yaml +++ b/recipes/gb200-fp4/1k8k/mid-curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" diff --git a/recipes/gb200-fp4/8k1k/low-latency.yaml b/recipes/gb200-fp4/8k1k/low-latency.yaml index 1fab3df1..f274b863 100644 --- a/recipes/gb200-fp4/8k1k/low-latency.yaml +++ b/recipes/gb200-fp4/8k1k/low-latency.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 4 + nginx_container: nginx model: path: "dsfp4" @@ -115,5 +116,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x32x64" + concurrencies: "4x8" req_rate: 300 diff --git a/recipes/gb200-fp4/8k1k/max-tpt.yaml b/recipes/gb200-fp4/8k1k/max-tpt.yaml index 
b54813ab..2164891a 100644 --- a/recipes/gb200-fp4/8k1k/max-tpt.yaml +++ b/recipes/gb200-fp4/8k1k/max-tpt.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" @@ -174,5 +175,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1024x2048" + concurrencies: "2048" req_rate: 700 diff --git a/recipes/gb200-fp4/8k1k/mid-curve.yaml b/recipes/gb200-fp4/8k1k/mid-curve.yaml index 95b33f54..43ed73fd 100644 --- a/recipes/gb200-fp4/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp4/8k1k/mid-curve.yaml @@ -7,6 +7,7 @@ frontend: type: dynamo enable_multiple_frontends: true num_additional_frontends: 9 + nginx_container: nginx model: path: "dsfp4" @@ -174,5 +175,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024x2048x4096" + concurrencies: "512x2048x4096" req_rate: 700 From 6ff475f2197471bb30687cb00a987fac81179a59 Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Tue, 3 Feb 2026 13:34:16 -0800 Subject: [PATCH 5/6] Cleanup configs --- recipes/gb200-fp8/1k1k/low-latency.yaml | 4 ++-- recipes/gb200-fp8/8k1k/max_tpt.yaml | 4 ---- recipes/gb200-fp8/8k1k/mid-curve.yaml | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml index 92cb098d..e0be8170 100644 --- a/recipes/gb200-fp8/1k1k/low-latency.yaml +++ b/recipes/gb200-fp8/1k1k/low-latency.yaml @@ -85,7 +85,7 @@ backend: tensor-parallel-size: 4 data-parallel-size: 1 expert-parallel-size: 1 - + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -112,7 +112,7 @@ backend: data-parallel-size: 1 expert-parallel-size: 1 fp8-gemm-backend: "flashinfer_trtllm" - + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml index b48de751..2032a01f 100644 --- 
a/recipes/gb200-fp8/8k1k/max_tpt.yaml +++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -87,8 +87,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" - disaggregation-bootstrap-port: 30001 # Prefill-specific mode disaggregation-mode: "prefill" @@ -142,8 +140,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" - disaggregation-bootstrap-port: 30001 # Decode-specific mode disaggregation-mode: "decode" diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml index bd0fb9ef..844d464b 100644 --- a/recipes/gb200-fp8/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -87,7 +87,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - disaggregation-bootstrap-port: 30001 # Prefill-specific mode disaggregation-mode: "prefill" @@ -141,7 +140,6 @@ backend: watchdog-timeout: 1000000 disable-shared-experts-fusion: true eplb-algorithm: "deepseek" - disaggregation-bootstrap-port: 30001 # Decode-specific mode disaggregation-mode: "decode" From 41385b5442a95e3b206abbcad472bd2923a5541a Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Tue, 3 Feb 2026 17:43:48 -0800 Subject: [PATCH 6/6] Switch to use fast DG cache compile --- recipes/gb200-fp8/1k1k/low-latency.yaml | 6 +++--- recipes/gb200-fp8/1k1k/max-tpt.yaml | 6 +++--- recipes/gb200-fp8/1k1k/mid-curve.yaml | 6 +++--- recipes/gb200-fp8/1k8k/low_latency.yaml | 6 +++--- recipes/gb200-fp8/1k8k/max_tpt.yaml | 8 +++----- recipes/gb200-fp8/1k8k/mid_curve.yaml | 6 +++--- recipes/gb200-fp8/8k1k/low-latency.yaml | 6 +++--- recipes/gb200-fp8/8k1k/max_tpt.yaml | 6 +++--- recipes/gb200-fp8/8k1k/mid-curve.yaml | 6 +++--- 9 files changed, 27 insertions(+), 29 
deletions(-) diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml index e0be8170..4349af7d 100644 --- a/recipes/gb200-fp8/1k1k/low-latency.yaml +++ b/recipes/gb200-fp8/1k1k/low-latency.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configsdg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" @@ -44,7 +44,7 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configsdg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_ENABLE_FLASHINFER_GEMM: "1" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" diff --git a/recipes/gb200-fp8/1k1k/max-tpt.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml index e20bfd84..2e6dfcbe 100644 --- a/recipes/gb200-fp8/1k1k/max-tpt.yaml +++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -44,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + 
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" diff --git a/recipes/gb200-fp8/1k1k/mid-curve.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml index 95eae698..7b59b995 100644 --- a/recipes/gb200-fp8/1k1k/mid-curve.yaml +++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -44,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml index 73eda59f..a24ea169 100644 --- a/recipes/gb200-fp8/1k8k/low_latency.yaml +++ b/recipes/gb200-fp8/1k8k/low_latency.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" @@ -44,7 +44,7 @@ backend: 
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml index 749b1bb8..904acf89 100644 --- a/recipes/gb200-fp8/1k8k/max_tpt.yaml +++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,8 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" -# SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -45,8 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" -# SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml index 452b033b..5c894a80 100644 --- a/recipes/gb200-fp8/1k8k/mid_curve.yaml +++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - 
SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -44,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" MC_TE_METRIC: "true" diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml index c73c7a8f..83a12e2e 100644 --- a/recipes/gb200-fp8/8k1k/low-latency.yaml +++ b/recipes/gb200-fp8/8k1k/low-latency.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" @@ -44,7 +44,7 @@ backend: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" PYTHONUNBUFFERED: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_JIT_DEEPGEMM: "false" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml index 2032a01f..d78dafb2 100644 --- a/recipes/gb200-fp8/8k1k/max_tpt.yaml +++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" 
resources: @@ -27,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -44,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" MC_TE_METRIC: "true" diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml index 844d464b..98674cc6 100644 --- a/recipes/gb200-fp8/8k1k/mid-curve.yaml +++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -11,7 +11,7 @@ frontend: model: path: "dsr1-fp8" - container: "lmsysorg/sglang:v0.5.8-cu130-runtime" + container: "lmsysorg/sglang:v0.5.8-cu130" precision: "fp8" resources: @@ -27,7 +27,7 @@ backend: # Prefill-specific environment variables prefill_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" MC_TE_METRIC: "true" SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" @@ -44,7 +44,7 @@ backend: # Decode-specific environment variables decode_environment: TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" DYN_SKIP_SGLANG_LOG_FORMATTING: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256" MC_TE_METRIC: "true"