From 1153beaebb671d3174cef398e9b4d14e582e9719 Mon Sep 17 00:00:00 2001
From: Weiliang Liu
Date: Mon, 27 Apr 2026 04:03:34 -0700
Subject: [PATCH 1/3] Add DSV4 Pro GB300 high-throughput recipe

---
Review note (ignored by git apply): line breaks and YAML indentation in this
series were reconstructed from a whitespace-collapsed copy; nesting was
inferred from the key hierarchy and checked against the hunk line counts.
The decode comment cites "cgmb=2048" while base decode sets
cuda-graph-max-bs: 4096; 2048 only appears as the [0] entry of
zip_override_8k1k_hightpt. TODO confirm which value the comment refers to.

 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 157 +++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
new file mode 100644
index 00000000..1ce2fdf8
--- /dev/null
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -0,0 +1,157 @@
+base:
+  name: "dsv4-pro-gb300-fp4"
+
+  slurm:
+    partition: gb300
+    time_limit: "03:00:00"
+
+  frontend:
+    type: sglang
+    enable_multiple_frontends: false
+    args:
+      policy: "cache_aware"
+
+  model:
+    path: "dsv4-pro"
+    container: "dsv4-grace-blackwell"
+    precision: "fp4"
+
+  resources:
+    gpu_type: "gb300"
+    gpus_per_node: 4
+    prefill_nodes: 1
+    prefill_workers: 1
+    decode_nodes: 2
+    decode_workers: 1
+
+  backend:
+    type: sglang
+
+    prefill_environment:
+      PYTHONUNBUFFERED: "1"
+      SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+      SGLANG_ENABLE_THINKING: "1"
+      SGLANG_REASONING_EFFORT: "max"
+      SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_USE_JIT_NORM: "1"
+      SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+      SGLANG_OPT_USE_TOPK_V2: "1"
+      SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+      SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+      SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+      SGLANG_OPT_USE_FAST_MASK_EP: "1"
+      SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+      SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+      NCCL_MNNVL_ENABLE: "1"
+      NCCL_CUMEM_ENABLE: "1"
+      SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+      MC_FORCE_MNNVL: "1"
+
+
+    decode_environment:
+      PYTHONUNBUFFERED: "1"
+      SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+      SGLANG_ENABLE_THINKING: "1"
+      SGLANG_REASONING_EFFORT: "max"
+      SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_USE_JIT_NORM: "1"
+      SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+      SGLANG_OPT_USE_TOPK_V2: "1"
+      SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+      SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+      SGLANG_OPT_USE_FAST_MASK_EP: "1"
+      SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+      SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+      NCCL_MNNVL_ENABLE: "1"
+      NCCL_CUMEM_ENABLE: "1"
+      SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+      MC_FORCE_MNNVL: "1"
+      # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+      # is single-node only and corrupts results in 2-node decode setups.
+
+    sglang_config:
+      prefill:
+        served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+        model-path: "/model/"
+        trust-remote-code: true
+
+        # Parallel
+        tensor-parallel-size: 4
+        data-parallel-size: 4
+        expert-parallel-size: 4
+
+        enable-dp-attention: true
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        disaggregation-mode: "prefill"
+        disaggregation-transfer-backend: mooncake
+
+        mem-fraction-static: 0.90
+        max-running-requests: 512
+        cuda-graph-max-bs: 512
+        chunked-prefill-size: 65536
+        disable-radix-cache: true
+
+      decode:
+        served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+        model-path: "/model/"
+        trust-remote-code: true
+        disable-radix-cache: true
+
+        disaggregation-mode: "decode"
+        disaggregation-transfer-backend: mooncake
+
+        # Decode: DEP8 (2 nodes)
+        tensor-parallel-size: 8
+        data-parallel-size: 8
+        expert-parallel-size: 8
+
+        # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
+        # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
+        # at cgmb=2048.
+        mem-fraction-static: 0.83
+        max-running-requests: 4096
+        cuda-graph-max-bs: 4096
+        swa-full-tokens-ratio: 0.1
+        context-length: 16384
+
+  benchmark:
+    type: "sa-bench"
+    concurrencies: "8192"
+    use_chat_template: false
+
+############ 8k1k ##############
+# [0]is wideep, [1] is narrow ep
+zip_override_8k1k_hightpt:
+  resources:
+    prefill_nodes: [5, 1]
+    prefill_workers: [5, 1]
+    decode_nodes: [8, 2]
+    decode_workers: [1, 1]
+  backend:
+    sglang_config:
+      decode:
+        tensor-parallel-size: [32, 8]
+        data-parallel-size: [32, 8]
+        expert-parallel-size: [32, 8]
+
+        enable-dp-attention: true
+        ep-num-redundant-experts: [32, null]
+        ep-dispatch-algorithm: ["static", null]
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        max-running-requests: [2048, 256]
+        cuda-graph-max-bs: [2048, 256]
+
+  benchmark:
+    isl: 8192
+    osl: 1024
+    concurrencies: "4096"
+
+############ 1k1k ###############

From 0e1608accd5394e9f877615df13e685100a6b0ab Mon Sep 17 00:00:00 2001
From: Weiliang Liu
Date: Mon, 27 Apr 2026 07:21:22 -0700
Subject: [PATCH 2/3] fix wideep oom

---
Review note (ignored by git apply): this patch lowers decode
mem-fraction-static from 0.83 to 0.75, but the comment kept as context
directly above it still cites 0.83. The comment should be updated together
with the value.

 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
index 1ce2fdf8..a6326fff 100644
--- a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -114,7 +114,7 @@ base:
         # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
         # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
         # at cgmb=2048.
-        mem-fraction-static: 0.83
+        mem-fraction-static: 0.75
         max-running-requests: 4096
         cuda-graph-max-bs: 4096
         swa-full-tokens-ratio: 0.1
@@ -141,6 +141,7 @@ zip_override_8k1k_hightpt:
         expert-parallel-size: [32, 8]
 
         enable-dp-attention: true
+        enable-dp-lm-head: true
         ep-num-redundant-experts: [32, null]
         ep-dispatch-algorithm: ["static", null]
         moe-a2a-backend: "deepep"

From 051024e73f258bf9b8019c17c6ca79b51a976e2b Mon Sep 17 00:00:00 2001
From: Weiliang Liu
Date: Mon, 27 Apr 2026 08:25:54 -0700
Subject: [PATCH 3/3] optimize perf

---
Review note (ignored by git apply):
- Decode mem-fraction-static becomes 0.9 and cuda-graph-max-bs becomes 8192,
  yet the unchanged comment above them still reads "Lower mfs on decode ...
  0.83 ... cgmb=2048". Prefill uses 0.90, so decode is no longer lower; the
  comment is stale and should be rewritten or removed in a follow-up.
- The commented-out extra_mount entry points at a developer home directory
  (/mnt/home/weiliang/...); consider dropping it before merge.

 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 47 ++++++++++++++--------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
index a6326fff..a948593c 100644
--- a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -24,6 +24,11 @@ base:
     decode_nodes: 2
     decode_workers: 1
 
+  # extra_mount:
+  #   - /mnt/home/weiliang/project/sglang:/sgl-workspace/sglang
+
+  # setup_script: "install_sglang.sh"
+
   backend:
     type: sglang
 
@@ -33,6 +38,7 @@ base:
       SGLANG_ENABLE_THINKING: "1"
       SGLANG_REASONING_EFFORT: "max"
       SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
       SGLANG_OPT_USE_JIT_NORM: "1"
       SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
       SGLANG_OPT_USE_TOPK_V2: "1"
@@ -41,14 +47,16 @@ base:
       SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
       SGLANG_OPT_USE_FAST_MASK_EP: "1"
       SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
       SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
       SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
       MC_FORCE_MNNVL: "1"
-
+      SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+      SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+      SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
 
     decode_environment:
       PYTHONUNBUFFERED: "1"
@@ -56,6 +64,7 @@ base:
       SGLANG_ENABLE_THINKING: "1"
       SGLANG_REASONING_EFFORT: "max"
       SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
       SGLANG_OPT_USE_JIT_NORM: "1"
       SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
       SGLANG_OPT_USE_TOPK_V2: "1"
@@ -63,13 +72,16 @@ base:
       SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
       SGLANG_OPT_USE_FAST_MASK_EP: "1"
       SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "576"
       SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
       SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
       MC_FORCE_MNNVL: "1"
+      SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+      SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+      SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
       # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
       # is single-node only and corrupts results in 2-node decode setups.
 
@@ -94,7 +106,7 @@ base:
         mem-fraction-static: 0.90
         max-running-requests: 512
         cuda-graph-max-bs: 512
-        chunked-prefill-size: 65536
+        chunked-prefill-size: 32768
         disable-radix-cache: true
 
       decode:
@@ -114,9 +126,9 @@ base:
         # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
         # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
         # at cgmb=2048.
-        mem-fraction-static: 0.75
-        max-running-requests: 4096
-        cuda-graph-max-bs: 4096
+        mem-fraction-static: 0.9
+        max-running-requests: 8192
+        cuda-graph-max-bs: 8192
         swa-full-tokens-ratio: 0.1
         context-length: 16384
 
@@ -129,30 +141,31 @@ base:
 # [0]is wideep, [1] is narrow ep
 zip_override_8k1k_hightpt:
   resources:
-    prefill_nodes: [5, 1]
-    prefill_workers: [5, 1]
-    decode_nodes: [8, 2]
+    prefill_nodes: [14, 1]
+    prefill_workers: [14, 1]
+    decode_nodes: [4, 2]
     decode_workers: [1, 1]
   backend:
     sglang_config:
       decode:
-        tensor-parallel-size: [32, 8]
-        data-parallel-size: [32, 8]
-        expert-parallel-size: [32, 8]
+        tensor-parallel-size: [16, 8]
+        data-parallel-size: [16, 8]
+        expert-parallel-size: [16, 8]
 
         enable-dp-attention: true
         enable-dp-lm-head: true
-        ep-num-redundant-experts: [32, null]
+
+        ep-num-redundant-experts: [16, null]
         ep-dispatch-algorithm: ["static", null]
         moe-a2a-backend: "deepep"
         deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-        max-running-requests: [2048, 256]
-        cuda-graph-max-bs: [2048, 256]
+        max-running-requests: [9216, 256]
+        cuda-graph-max-bs: [576, 32]
 
   benchmark:
     isl: 8192
     osl: 1024
-    concurrencies: "4096"
+    concurrencies: "8192"
 
 ############ 1k1k ###############