diff --git a/configs/rebuild-deepep.sh b/configs/rebuild-deepep.sh
new file mode 100644
index 00000000..d8edc142
--- /dev/null
+++ b/configs/rebuild-deepep.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -eux
+
+echo "=== Rebuilding DeepEP with kNumMaxTopK=16 for Qwen3.5 (topk=10) ==="
+
+DEEPEP_SRC="/sgl-workspace/DeepEP"
+
+if [ ! -d "$DEEPEP_SRC" ]; then
+    echo "ERROR: DeepEP source not found at $DEEPEP_SRC (mount via extra_mount)"
+    exit 1
+fi
+
+cd "$DEEPEP_SRC"
+
+# Find NVSHMEM
+NVSHMEM_DIR=$(find /usr/local -name "nvshmem" -type d 2>/dev/null | head -1)
+if [ -z "${NVSHMEM_DIR:-}" ]; then
+    echo "ERROR: NVSHMEM installation not found under /usr/local" >&2
+    exit 1
+fi
+echo "NVSHMEM_DIR=$NVSHMEM_DIR"
+
+# Fix missing nvshmem symlinks (container has .so.3 but not .so)
+NVSHMEM_LIB="$NVSHMEM_DIR/lib"
+if [ ! -f "$NVSHMEM_LIB/libnvshmem_host.so" ] && [ -f "$NVSHMEM_LIB/libnvshmem_host.so.3" ]; then
+    echo "Creating missing nvshmem symlinks..."
+    ln -sf libnvshmem_host.so.3 "$NVSHMEM_LIB/libnvshmem_host.so"
+fi
+
+# Apply the kNumMaxTopK=16 patch (Qwen3.5 uses topk=10; the default kNumMaxTopK=8 is insufficient)
+# Note: the source has both kNumMaxTopK (uppercase K) and kNumMaxTopk (lowercase k) as separate variables
+sed -i 's/kNumMaxTopK[[:space:]]*=[[:space:]]*[0-9][0-9]*/kNumMaxTopK = 16/g' csrc/kernels/internode_ll.cu
+sed -i 's/kNumMaxTopk[[:space:]]*=[[:space:]]*[0-9][0-9]*/kNumMaxTopk = 16/g' csrc/kernels/internode_ll.cu
+
+# Verify the patch was applied
+grep -q "kNumMaxTop. = 16" csrc/kernels/internode_ll.cu && echo "Patch verified: kNumMaxTopK/k=16" || {
+    echo "ERROR: kNumMaxTopK patch failed to apply!"; exit 1;
+}
+
+# Build with full output so we can debug failures;
+# set -e aborts the script if the build fails
+TORCH_CUDA_ARCH_LIST="10.0" \
+NVSHMEM_DIR="$NVSHMEM_DIR" \
+pip install -e . --no-build-isolation 2>&1
+
+echo "=== DeepEP rebuild complete ==="
+python3 -c "import deep_ep; print('deep_ep imported successfully')"
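For orientation, a sketch of how this setup script could be exercised by hand before a recipe run; the experimental recipe below invokes it via setup_script, and the /sgl-workspace/DeepEP and /configs mount points are assumptions carried over from the script itself:

    # Hypothetical manual invocation inside the serving container (image name from the recipes).
    docker run --rm --gpus all \
        -v "$PWD/DeepEP:/sgl-workspace/DeepEP" \
        -v "$PWD/configs:/configs" \
        lmsysorg/sglang:dev \
        bash /configs/rebuild-deepep.sh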
diff --git a/recipes/qwen3.5/1p1d-dep4-dep4.yaml b/recipes/qwen3.5/1p1d-dep4-dep4.yaml
new file mode 100644
index 00000000..a76d2f90
--- /dev/null
+++ b/recipes/qwen3.5/1p1d-dep4-dep4.yaml
@@ -0,0 +1,126 @@
+# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + DEP4 Decode
+# Both sides use Data Expert Parallel (DP4 + TP4 + EP4) with dp-attention
+# Homogeneous TP layout to avoid KV/Mamba state slice transfer overhead
+
+name: "qwen3.5-1p1d-dep4-dep4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+      moe-runner-backend: "flashinfer_trtllm"
+
+      # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as decode)
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-dense-tp-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+      disaggregation-decode-tp: 4
+      disaggregation-decode-dp: 4
+
+      mem-fraction-static: 0.80
+      chunked-prefill-size: 16384
+      context-length: 2020
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+
+    decode:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+      moe-runner-backend: "flashinfer_trtllm"
+
+      # DEP4: DP4 + TP4 + EP4 with dp-attention
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-dense-tp-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.80
+      chunked-prefill-size: 16384
+      context-length: 2020
+      cuda-graph-max-bs: 1024
+      watchdog-timeout: 1000000
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
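A note on the DEP4 arithmetic in this recipe: with enable-dp-attention, SGLang partitions the TP group into DP attention replicas rather than multiplying the GPU count, so (on my reading of dp-attention; treat this as an assumption, not a statement of SGLang internals) the four-way settings above land exactly on the 4 GPUs of one GB200 node, with an attention TP degree of tensor-parallel-size / data-parallel-size = 1. A minimal sanity-check sketch:

    # Hypothetical layout check mirroring the DEP4 knobs above.
    TP=4 DP=4 EP=4 GPUS=$((4 * 1))    # gpus_per_node * nodes per side
    (( TP % DP == 0 )) || { echo "dp-attention requires dp to divide tp"; exit 1; }
    (( EP <= TP ))     || { echo "ep cannot exceed tp in this layout"; exit 1; }
    (( TP == GPUS ))   || { echo "worker world size != GPU count"; exit 1; }
    echo "attention TP per DP rank: $(( TP / DP ))"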
diff --git a/recipes/qwen3.5/1p1d-tep4-tep4.yaml b/recipes/qwen3.5/1p1d-tep4-tep4.yaml
new file mode 100644
index 00000000..90f8ad37
--- /dev/null
+++ b/recipes/qwen3.5/1p1d-tep4-tep4.yaml
@@ -0,0 +1,115 @@
+# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TEP4 Prefill + TEP4 Decode
+# Both sides use Tensor Expert Parallel (TP4 + EP4), no dp-attention
+
+name: "qwen3.5-1p1d-tep4-tep4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # TEP4: TP4 + EP4, standard TP attention (no dp-attention)
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      moe-dense-tp-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+      disaggregation-decode-tp: 4
+      disaggregation-decode-dp: 1
+
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      context-length: 2020
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+
+    decode:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # TEP4: TP4 + EP4, standard TP attention (no dp-attention)
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      moe-dense-tp-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.70
+      chunked-prefill-size: 16384
+      context-length: 2020
+      watchdog-timeout: 1000000
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "8x32x128x256x512x1024"
+  req_rate: "inf"
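The benchmark blocks in these recipes encode the concurrency sweep as an x-separated string. How the sa-bench harness consumes it is not shown in this diff; a plausible expansion, offered purely as an assumption about the harness, looks like:

    # Assumed parsing of the recipes' `concurrencies` field.
    CONCURRENCIES="8x32x128x256x512x1024"
    IFS='x' read -ra LEVELS <<< "$CONCURRENCIES"
    for c in "${LEVELS[@]}"; do
        echo "sa-bench sweep point: concurrency=$c isl=1000 osl=1000"
    done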
diff --git a/recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml b/recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml
new file mode 100644
index 00000000..22b98c8b
--- /dev/null
+++ b/recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml
@@ -0,0 +1,114 @@
+# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode + MTP (EAGLE speculative decoding)
+# Pure tensor parallel, no expert parallel, with speculative decoding
+
+name: "qwen3.5-1p1d-tp4-tp4-mtp"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+      disaggregation-decode-tp: 4
+      disaggregation-decode-dp: 1
+
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      context-length: 2020
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+
+    decode:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 4
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      speculative-algorithm: "EAGLE"
+      speculative-num-steps: 2
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 3
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      context-length: 2020
+      watchdog-timeout: 1000000
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "8x32x128x256x512x1024"
+  req_rate: "inf"
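On the speculative settings above: with a chain draft (speculative-eagle-topk: 1), each round proposes speculative-num-steps tokens and verification covers them plus one bonus token, which is how 2 steps yield speculative-num-draft-tokens: 3. That relationship is my reading of common EAGLE setups, not something this diff states:

    # Consistency check for the EAGLE knobs (assumed formula: steps * topk + 1).
    STEPS=2 TOPK=1 DRAFT=3
    if (( DRAFT == STEPS * TOPK + 1 )); then
        echo "draft-token budget consistent"
    else
        echo "WARNING: draft tokens inconsistent with steps/topk" >&2
    fi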
diff --git a/recipes/qwen3.5/1p1d-tp4-tp4.yaml b/recipes/qwen3.5/1p1d-tp4-tp4.yaml
new file mode 100644
index 00000000..dfb1776e
--- /dev/null
+++ b/recipes/qwen3.5/1p1d-tp4-tp4.yaml
@@ -0,0 +1,109 @@
+# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode
+# Pure tensor parallel, no expert parallel
+
+name: "qwen3.5-1p1d-tp4-tp4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+      disaggregation-decode-tp: 4
+      disaggregation-decode-dp: 1
+
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      context-length: 2020
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+
+    decode:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 4
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      context-length: 2020
+      watchdog-timeout: 1000000
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "8x32x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/qwen3.5/agg-dep4.yaml b/recipes/qwen3.5/agg-dep4.yaml
new file mode 100644
index 00000000..cbd1ea82
--- /dev/null
+++ b/recipes/qwen3.5/agg-dep4.yaml
@@ -0,0 +1,66 @@
+# Qwen3.5-397B-A17B-FP8 Aggregated DEP4 on GB200
+# Data Expert Parallel: DP4 + TP4 + EP4 with dp-attention
+
+name: "qwen3.5-agg-dep4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # DEP4: DP4 + TP4 + EP4 with dp-attention
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-dense-tp-size: 1
+
+      mamba-ssm-dtype: "bfloat16"
+      moe-runner-backend: "flashinfer_trtllm"
+
+      disable-radix-cache: true
+      max-running-requests: 1024
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 16384
+      max-prefill-tokens: 16384
+      context-length: 2020
+      cuda-graph-max-bs: 1024
+      # enable-symm-mem: true
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/qwen3.5/agg-tep4.yaml b/recipes/qwen3.5/agg-tep4.yaml
new file mode 100644
index 00000000..27d73327
--- /dev/null
+++ b/recipes/qwen3.5/agg-tep4.yaml
@@ -0,0 +1,63 @@
+# Qwen3.5-397B-A17B-FP8 Aggregated TEP4 on GB200
+# Tensor Expert Parallel: TP4 + EP4, no dp-attention
+
+name: "qwen3.5-agg-tep4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # TEP4: TP4 + EP4, standard TP attention (no dp-attention)
+      tensor-parallel-size: 4
+      expert-parallel-size: 4
+      moe-dense-tp-size: 1
+
+      mamba-ssm-dtype: "bfloat16"
+      moe-runner-backend: "flashinfer_trtllm"
+
+      disable-radix-cache: true
+      max-running-requests: 1024
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 16384
+      max-prefill-tokens: 16384
+      context-length: 2020
+      cuda-graph-max-bs: 1024
+      # enable-symm-mem: true
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
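The two aggregated MoE recipes above differ only in whether attention runs data-parallel: agg-dep4 adds data-parallel-size, enable-dp-attention, and enable-dp-lm-head on top of the shared TP4 + EP4 base. When editing them in tandem, a quick structural diff keeps the pair honest:

    # Compare just the parallelism knobs of the two recipes.
    diff \
        <(grep -E '(parallel-size|enable-dp)' recipes/qwen3.5/agg-dep4.yaml) \
        <(grep -E '(parallel-size|enable-dp)' recipes/qwen3.5/agg-tep4.yaml)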
diff --git a/recipes/qwen3.5/agg-tp4.yaml b/recipes/qwen3.5/agg-tp4.yaml
new file mode 100644
index 00000000..4514409a
--- /dev/null
+++ b/recipes/qwen3.5/agg-tp4.yaml
@@ -0,0 +1,59 @@
+# Qwen3.5-397B-A17B-FP8 Aggregated TP4 on GB200
+# Pure tensor parallel, no expert parallel
+
+name: "qwen3.5-agg-tp4"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      attention-backend: "trtllm_mha"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 4
+
+      mamba-ssm-dtype: "bfloat16"
+      moe-runner-backend: "flashinfer_trtllm"
+
+      disable-radix-cache: true
+      max-running-requests: 1024
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 16384
+      max-prefill-tokens: 16384
+      context-length: 2020
+      cuda-graph-max-bs: 1024
+      # enable-symm-mem: true
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1000
+  osl: 1000
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml b/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml
new file mode 100644
index 00000000..b2ccd6e1
--- /dev/null
+++ b/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml
@@ -0,0 +1,147 @@
+# Qwen3.5-397B-A17B-FP8 1P1D: Prefill TP4 (1 node) + Decode DeepEP TP8/DP8/EP8 (2 nodes)
+# Total 3 nodes: prefill simple, decode wide-EP with low_latency
+# Performance is unverified; accuracy testing is in progress.
+# Purpose: accuracy verification (not for Pareto benchmarking)
+
+name: "qwen3.5-1p1d-tp4-deepep-deepgemm"
+
+model:
+  path: "qwen3.5-fp8"
+  container: "dev"  # docker://lmsysorg/sglang:dev
+  precision: "fp8"
+
+setup_script: "rebuild-deepep.sh"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  # Prefill: 1 node TP4 (no EP), Decode: 2 nodes TP8/DP8/EP8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Parallelism: simple TP4 on 1 node, no EP
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      # Mamba hybrid model settings
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      # PD disaggregation
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+
+      # Memory: same as AGG/1p1d configs
+      mem-fraction-static: 0.75
+      chunked-prefill-size: 16384
+      # context-length: 2020  # left unset for the accuracy run (GPQA generates up to 65536 tokens)
+
+      # Tell prefill about decode's actual TP/DP sizes for correct KV transfer
+      disaggregation-decode-tp: 8
+      disaggregation-decode-dp: 8
+
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+
+    decode:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+
+
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Parallelism: TP8/DP8/EP8 with dp-attention across 2 GB200 nodes
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+
+      # Mamba hybrid model settings
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+
+      # PD disaggregation
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+      disaggregation-bootstrap-port: 30001
+
+      mem-fraction-static: 0.60
+
+      max-mamba-cache-size: 3200
+      max-running-requests: 640
+      chunked-prefill-size: 16384
+      context-length: 2020
+      cuda-graph-max-bs: 128
+      watchdog-timeout: 1000000
+
+      # DeepEP: low_latency for decode (requires rebuilt DeepEP with kNumMaxTopK=16)
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+
+      # DeepGEMM for MoE GEMM
+      moe-runner-backend: "deep_gemm"
+
+      # Expert load balancing (EPLB)
+      eplb-algorithm: "deepseek"
+
+benchmark:
+  type: "gpqa"
+  num_examples: 198
+  max_tokens: 65536
+  repeat: 8
+  num_threads: 128
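The low_latency DeepEP path is why this recipe is the one that sets setup_script: "rebuild-deepep.sh": stock DeepEP caps kNumMaxTopK at 8, while Qwen3.5 routes to 10 experts per token. A quick pre-flight check against the model checkpoint; the num_experts_per_tok field name follows the usual HF MoE config convention and is an assumption here:

    # Verify the router top-k actually exceeds the stock DeepEP limit.
    TOPK=$(python3 -c "import json; print(json.load(open('/model/config.json'))['num_experts_per_tok'])")
    if (( TOPK > 8 )); then
        echo "topk=$TOPK > 8: rebuilt DeepEP (kNumMaxTopK=16) is required"
    fi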
"fp8_e4m3" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2020 + watchdog-timeout: 1000000 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "8x32x128x256x512x1024" + req_rate: "inf"