diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml new file mode 100644 index 00000000..a948593c --- /dev/null +++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml @@ -0,0 +1,171 @@ +base: + name: "dsv4-pro-gb300-fp4" + + slurm: + partition: gb300 + time_limit: "03:00:00" + + frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + + model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + + resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + + # extra_mount: + # - /mnt/home/weiliang/project/sglang:/sgl-workspace/sglang + + # setup_script: "install_sglang.sh" + + backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + 
SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "576" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + # Parallelism: TP = DP = EP = 4 (gpus_per_node is 4, one node per prefill worker) + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + # Decode: DEP8 (2 nodes) + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # NOTE(review): stale rationale -- this comment described lowering mfs to + # 0.83 for cuda-graph capture at cgmb=2048 (DEP8 weight memory and KV pool + # ~2x vs DEP4), but the values set below are 0.9 and 8192. Confirm intent. 
+ mem-fraction-static: 0.9 + max-running-requests: 8192 + cuda-graph-max-bs: 8192 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + + benchmark: + type: "sa-bench" + concurrencies: "8192" + use_chat_template: false + +############ 8k1k ############## +# Index [0] of each list is the wide-EP variant (EP16), [1] is the narrow-EP variant (EP8) +zip_override_8k1k_hightpt: + resources: + prefill_nodes: [14, 1] + prefill_workers: [14, 1] + decode_nodes: [4, 2] + decode_workers: [1, 1] + backend: + sglang_config: + decode: + tensor-parallel-size: [16, 8] + data-parallel-size: [16, 8] + expert-parallel-size: [16, 8] + + enable-dp-attention: true + enable-dp-lm-head: true + + ep-num-redundant-experts: [16, null] + ep-dispatch-algorithm: ["static", null] + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + max-running-requests: [9216, 256] + cuda-graph-max-bs: [576, 32] + + benchmark: + isl: 8192 + osl: 1024 + concurrencies: "8192" + +############ 1k1k ###############