diff --git a/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml b/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml deleted file mode 100644 index b2ccd6e15..000000000 --- a/recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml +++ /dev/null @@ -1,147 +0,0 @@ -# Qwen3.5-397B-A17B-FP8 1P1D: Prefill TP4 (1 node) + Decode DeepEP TP8/EP8 (2 nodes) -# Total 3 nodes: prefill simple, decode wide-EP with low_latency -# Performance is un-verified. Accuracy testing in progress. -# Purpose: accuracy verification (not for pareto benchmarking) - -name: "qwen3.5-1p1d-tp4-deepep-deepgemm" - -model: - path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev - precision: "fp8" - -setup_script: "rebuild-deepep.sh" - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - # Prefill: 1 node TP4 (no EP), Decode: 2 nodes TP8/EP8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - -backend: - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - MC_FORCE_MNNVL: "1" - SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" - FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - MC_FORCE_MNNVL: "1" - MC_TE_METRIC: "true" - SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" - FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" - 
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - sglang_config: - prefill: - served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" - model-path: "/model/" - - - quantization: "fp8" - kv-cache-dtype: "fp8_e4m3" - - # Parallelism: simple TP4 on 1 node, no EP - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 - - # Mamba hybrid model settings - mamba-scheduler-strategy: "no_buffer" - mamba-track-interval: 2048 - mamba-ssm-dtype: "bfloat16" - - # PD disaggregation - disaggregation-mode: "prefill" - disable-radix-cache: true - - # Memory: same as AGG/1p1d configs - mem-fraction-static: 0.75 - chunked-prefill-size: 16384 - # context-length: 2020 # commented out because of accuracy test - - # Tell prefill about decode's actual TP/DP sizes for correct KV transfer - disaggregation-decode-tp: 8 - disaggregation-decode-dp: 8 - - load-balance-method: "round_robin" - watchdog-timeout: 1000000 - disable-cuda-graph: true - - decode: - served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" - model-path: "/model/" - - - quantization: "fp8" - kv-cache-dtype: "fp8_e4m3" - - # Parallelism: TP8/EP8 across 2 GB200 nodes - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - enable-dp-attention: true - - # Mamba hybrid model settings - mamba-scheduler-strategy: "no_buffer" - mamba-track-interval: 2048 - mamba-ssm-dtype: "bfloat16" - - # PD disaggregation - disaggregation-mode: "decode" - disable-radix-cache: true - disaggregation-bootstrap-port: 30001 - - mem-fraction-static: 0.60 - - max-mamba-cache-size: 3200 - max-running-requests: 640 - chunked-prefill-size: 16384 - context-length: 2020 - cuda-graph-max-bs: 128 - watchdog-timeout: 1000000 - - # DeepEP: low_latency for decode (requires rebuilt DeepEP with kNumMaxTopK=16) - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - ep-dispatch-algorithm: "static" - 
moe-dense-tp-size: 1 - enable-dp-lm-head: true - prefill-round-robin-balance: true - - # DeepGemm for MoE GEMM - moe-runner-backend: "deep_gemm" - - # Workload balance - eplb-algorithm: "deepseek" - -benchmark: - type: "gpqa" - num_examples: 198 - max_tokens: 65536 - repeat: 8 - num_threads: 128 diff --git a/recipes/qwen3.5/fp8/agg/mtp_radix_off/tp4-mtp-acc.yaml b/recipes/qwen3.5/fp8/agg/mtp_radix_off/tp4-mtp-acc.yaml new file mode 100644 index 000000000..f94b846a0 --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/mtp_radix_off/tp4-mtp-acc.yaml @@ -0,0 +1,63 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K) +# NEXTN MTP speculative decoding, radix cache OFF + +name: "qwen3.5-agg-tp4-mtp-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + disable-radix-cache: true + mamba-scheduler-strategy: "no_buffer" + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/fp8/agg/mtp_radix_on/tp4-mtp-acc.yaml 
b/recipes/qwen3.5/fp8/agg/mtp_radix_on/tp4-mtp-acc.yaml new file mode 100644 index 000000000..9922fe8da --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/mtp_radix_on/tp4-mtp-acc.yaml @@ -0,0 +1,63 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K) +# NEXTN MTP speculative decoding, radix cache ON (prefix caching enabled) + +name: "qwen3.5-agg-tp4-mtp-radix-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mamba-scheduler-strategy: "extra_buffer" + disable-radix-cache: false + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/nixl/agg-dep4.yaml b/recipes/qwen3.5/fp8/agg/profile/agg-dep4-profile.yaml similarity index 92% rename from recipes/qwen3.5/nixl/agg-dep4.yaml rename to recipes/qwen3.5/fp8/agg/profile/agg-dep4-profile.yaml index cbd1ea822..1f6579093 100644 --- a/recipes/qwen3.5/nixl/agg-dep4.yaml +++ b/recipes/qwen3.5/fp8/agg/profile/agg-dep4-profile.yaml @@ -5,7 +5,7 @@ name: 
"qwen3.5-agg-dep4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -30,7 +30,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -53,11 +52,17 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + enable-symm-mem: true decode-log-interval: 1 stream-interval: 50 +profiling: + type: "torch" + aggregated: + start_step: 10 + stop_step: 20 + benchmark: type: "sa-bench" isl: 1000 diff --git a/recipes/qwen3.5/nixl/agg-tep4.yaml b/recipes/qwen3.5/fp8/agg/profile/agg-tep4-profile.yaml similarity index 91% rename from recipes/qwen3.5/nixl/agg-tep4.yaml rename to recipes/qwen3.5/fp8/agg/profile/agg-tep4-profile.yaml index 27d73327b..557840f8c 100644 --- a/recipes/qwen3.5/nixl/agg-tep4.yaml +++ b/recipes/qwen3.5/fp8/agg/profile/agg-tep4-profile.yaml @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tep4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -30,7 +30,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -50,11 +49,17 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + enable-symm-mem: true decode-log-interval: 1 stream-interval: 50 +profiling: + type: "torch" + aggregated: + start_step: 10 + stop_step: 20 + benchmark: type: "sa-bench" isl: 1000 diff --git a/recipes/qwen3.5/nixl/agg-tp4.yaml b/recipes/qwen3.5/fp8/agg/profile/agg-tp4-profile.yaml similarity index 90% rename from recipes/qwen3.5/nixl/agg-tp4.yaml rename to recipes/qwen3.5/fp8/agg/profile/agg-tp4-profile.yaml index 4514409a4..99915ff94 100644 --- a/recipes/qwen3.5/nixl/agg-tp4.yaml +++ 
b/recipes/qwen3.5/fp8/agg/profile/agg-tp4-profile.yaml @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tp4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -29,7 +29,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -46,11 +45,17 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + enable-symm-mem: true decode-log-interval: 1 stream-interval: 50 +profiling: + type: "torch" + aggregated: + start_step: 10 + stop_step: 20 + benchmark: type: "sa-bench" isl: 1000 diff --git a/recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4-acc.yaml b/recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4-acc.yaml new file mode 100644 index 000000000..3eb7e08a3 --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4-acc.yaml @@ -0,0 +1,64 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated DEP4 Accuracy Verification (GSM8K) +# Data Expert Parallel: DP4 + TP4 + EP4 with dp-attention +# Prefix caching off, context-length enlarged for accuracy tasks + +name: "qwen3.5-agg-dep4-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true 
+ moe-dense-tp-size: 1 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/agg-dep4.yaml b/recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4.yaml similarity index 92% rename from recipes/qwen3.5/agg-dep4.yaml rename to recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4.yaml index cbd1ea822..5c5d7ce7a 100644 --- a/recipes/qwen3.5/agg-dep4.yaml +++ b/recipes/qwen3.5/fp8/agg/stp_prefix_off/dep4.yaml @@ -5,7 +5,7 @@ name: "qwen3.5-agg-dep4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -30,7 +30,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -53,7 +52,7 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling decode-log-interval: 1 stream-interval: 50 diff --git a/recipes/qwen3.5/agg-tep4.yaml b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tep4.yaml similarity index 92% rename from recipes/qwen3.5/agg-tep4.yaml rename to recipes/qwen3.5/fp8/agg/stp_prefix_off/tep4.yaml index 27d73327b..c2ff3c841 100644 --- a/recipes/qwen3.5/agg-tep4.yaml +++ b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tep4.yaml @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tep4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -30,7 +30,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" 
quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -50,7 +49,7 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling decode-log-interval: 1 stream-interval: 50 diff --git a/recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4-acc.yaml b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4-acc.yaml new file mode 100644 index 000000000..fb7938424 --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4-acc.yaml @@ -0,0 +1,57 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated TP4 Accuracy Verification (GSM8K) +# Pure tensor parallel, no expert parallel +# Prefix caching off, context-length enlarged for accuracy tasks + +name: "qwen3.5-agg-tp4-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/agg-tp4.yaml b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4.yaml similarity index 91% rename from recipes/qwen3.5/agg-tp4.yaml rename to recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4.yaml index 
4514409a4..b8f1326d5 100644 --- a/recipes/qwen3.5/agg-tp4.yaml +++ b/recipes/qwen3.5/fp8/agg/stp_prefix_off/tp4.yaml @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tp4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -29,7 +29,6 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" @@ -46,7 +45,7 @@ backend: max-prefill-tokens: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - # enable-symm-mem: true + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling decode-log-interval: 1 stream-interval: 50 diff --git a/recipes/qwen3.5/fp8/agg/stp_radix_on/dep4-acc.yaml b/recipes/qwen3.5/fp8/agg/stp_radix_on/dep4-acc.yaml new file mode 100644 index 000000000..7698f069a --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/stp_radix_on/dep4-acc.yaml @@ -0,0 +1,64 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated DEP4 Accuracy Verification (GSM8K) +# Data Expert Parallel: DP4 + TP4 + EP4, radix cache ON (prefix caching enabled) + +name: "qwen3.5-agg-dep4-radix-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + 
mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + mamba-scheduler-strategy: "extra_buffer" + disable-radix-cache: false + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/fp8/agg/stp_radix_on/tp4-acc.yaml b/recipes/qwen3.5/fp8/agg/stp_radix_on/tp4-acc.yaml new file mode 100644 index 000000000..28a9f7053 --- /dev/null +++ b/recipes/qwen3.5/fp8/agg/stp_radix_on/tp4-acc.yaml @@ -0,0 +1,57 @@ +# Qwen3.5-397B-A17B-FP8 Aggregated TP4 Accuracy Verification (GSM8K) +# Pure tensor parallel, radix cache ON (prefix caching enabled) + +name: "qwen3.5-agg-tp4-radix-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + mamba-scheduler-strategy: "extra_buffer" + disable-radix-cache: false + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-mtp-acc.yaml 
b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-mtp-acc.yaml new file mode 100644 index 000000000..a81e53c37 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-mtp-acc.yaml @@ -0,0 +1,128 @@ +# Qwen3.5-397B-A17B-FP8 Disagg 1P1D + MTP Accuracy Verification (GPQA) +# TP4 Prefill + TP4 Decode with NEXTN MTP speculative decoding +# NO prefix caching: disable-radix-cache + no_buffer +# Purpose: verify disagg + MTP correctness without prefix caching + +name: "qwen3.5-1p1d-tp4-mtp-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + 
sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # no prefix caching: disable-radix-cache + no_buffer + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + moe-runner-backend: "flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + disable-cuda-graph: true + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # no prefix caching: disable-radix-cache + no_buffer + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + moe-runner-backend: "flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + disaggregation-mode: "decode" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + watchdog-timeout: 1000000 + + enable-multimodal: true + reasoning-parser: qwen3 + +benchmark: + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 8 + num_threads: 32 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-tp4-mtp.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-tp4-mtp.yaml new file mode 100644 index 000000000..53e182254 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_off/1p1d-tp4-mtp.yaml @@ -0,0 +1,116 @@ +# 
Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode + MTP (NEXTN MTP spec dec) +# Pure tensor parallel, no expert parallel, with speculative decoding +# 1k1k sa-bench concurrency sweep + +name: "qwen3.5-1p1d-tp4-mtp" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + mamba-ssm-dtype: 
"bfloat16" + + disaggregation-mode: "prefill" + disable-radix-cache: true + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + tensor-parallel-size: 4 + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + mamba-ssm-dtype: "bfloat16" + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + disaggregation-mode: "decode" + disable-radix-cache: true + + max-running-requests: 1024 + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache-retraction.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache-retraction.yaml new file mode 100644 index 000000000..6e8f4a8fb --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache-retraction.yaml @@ -0,0 +1,130 @@ +# Qwen3.5-397B-A17B-FP8 Disagg 1P1D + MTP + Prefix Caching Accuracy Verification (GPQA) +# TP4 Prefill + TP4 Decode with NEXTN MTP speculative decoding +# WITH prefix caching: extra_buffer + radix cache enabled +# Purpose: reproduce/investigate low accuracy (0.71/0.61 on GSM8K) in disagg + MTP + prefix caching +# +# Known issue: Disagg + Prefix Caching + MTP shows significantly degraded accuracy +# GSM8K Instruct/Thinking: 0.71/0.61 vs expected ~0.94/0.93 + +name: 
"qwen3.5-1p1d-tp4-mtp-acc-prefixcache-retraction" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # prefix caching enabled: extra_buffer + radix cache (no disable-radix-cache) + mamba-scheduler-strategy: "extra_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + 
moe-runner-backend: "flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + disable-cuda-graph: true + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # prefix caching enabled: extra_buffer + radix cache (no disable-radix-cache) + mamba-scheduler-strategy: "extra_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + moe-runner-backend: "flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + max-running-requests: 250 + + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + disaggregation-mode: "decode" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + watchdog-timeout: 1000000 + + enable-multimodal: true + reasoning-parser: qwen3 + +benchmark: + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 8 + num_threads: 128 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache.yaml new file mode 100644 index 000000000..0b6b0a91e --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/mtp_radix_on/1p1d-mtp-acc-prefixcache.yaml @@ -0,0 +1,126 @@ +# Qwen3.5-397B-A17B-FP8 Disagg 1P1D + MTP + Prefix Caching Accuracy Verification (GPQA) +# TP4 Prefill + TP4 Decode with NEXTN MTP speculative decoding +# Prefill: extra_buffer + radix cache (prefix caching on prefill side) +# IMPORTANT: sglang ratio calc bug when disable_radix_cache=True + extra_buffer + spec dec: +# ratio is hardcoded to 1, ignoring extra_buffer's ping-pong overhead (2 mamba 
slots/req). + +name: "qwen3.5-1p1d-tp4-mtp-acc-prefixcache" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + # prefix caching enabled: extra_buffer + radix cache (no disable-radix-cache) + mamba-scheduler-strategy: "extra_buffer" + mamba-track-interval: 2048 + moe-runner-backend: 
"flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + max-running-requests: 64 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + mamba-scheduler-strategy: "extra_buffer" + mamba-track-interval: 2048 + moe-runner-backend: "flashinfer_trtllm" + mamba-ssm-dtype: "bfloat16" + max-running-requests: 64 + + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + disaggregation-mode: "decode" + + mem-fraction-static: 0.75 + chunked-prefill-size: 16384 + context-length: 2200 + + watchdog-timeout: 1000000 + + enable-multimodal: true + reasoning-parser: qwen3 + +benchmark: + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 1 + num_threads: 64 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-dep4-nsys-profile.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-dep4-nsys-profile.yaml new file mode 100644 index 000000000..4d125c89f --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-dep4-nsys-profile.yaml @@ -0,0 +1,132 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + DEP4 Decode +# Staging buffer + async scatter — nsys profiling v3 +# Push decode start_step to 200 to skip cold-start; keep OSL=128 (proven stable) +# Workload: 1k input / 128 output, concurrency 128 + +name: "qwen3.5-1p1d-dep4-nsys-profile" + +model: + path: "qwen3.5-fp8" + container: "dev-0318" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + 
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "128" + SGLANG_LOG_FORWARD_ITERS: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "128" + SGLANG_LOG_FORWARD_ITERS: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: 
"Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +profiling: + type: "nsys" + prefill: + start_step: 10 + stop_step: 15 + decode: + start_step: 200 + stop_step: 400 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 128 + concurrencies: "128" + req_rate: "10" diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-mtp-profile.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-mtp-profile.yaml new file mode 100644 index 000000000..5997e7516 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-mtp-profile.yaml @@ -0,0 +1,125 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode + MTP (NEXTN MTP spec dec) +# Pure tensor parallel, no expert parallel, with speculative decoding +# 1k1k sa-bench concurrency sweep + +name: "qwen3.5-1p1d-tp4-mtp" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + 
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disable-radix-cache: true + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + tensor-parallel-size: 4 + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing + mamba-ssm-dtype: "bfloat16" + + speculative-algorithm: "NEXTN" + speculative-num-steps: 3 + speculative-eagle-topk: 
1 + speculative-num-draft-tokens: 4 + + disaggregation-mode: "decode" + disable-radix-cache: true + + max-running-requests: 1024 + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +profiling: + type: "torch" + prefill: + start_step: 10 + stop_step: 20 + decode: + start_step: 10 + stop_step: 20 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" diff --git a/recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-profile.yaml similarity index 82% rename from recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-profile.yaml index 22b98c8b2..04d56d1b3 100644 --- a/recipes/qwen3.5/1p1d-tp4-tp4-mtp.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-tp4-profile.yaml @@ -1,11 +1,12 @@ -# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode + MTP (EAGLE spec dec) -# Pure tensor parallel, no expert parallel, with speculative decoding +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode +# Pure tensor parallel, no expert parallel +# 1k1k sa-bench concurrency sweep -name: "qwen3.5-1p1d-tp4-tp4-mtp" +name: "qwen3.5-1p1d-tp4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -53,9 +54,7 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" tensor-parallel-size: 4 @@ -63,52 +62,56 @@ backend: expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - 
mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" tensor-parallel-size: 4 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - disaggregation-mode: "decode" - disable-radix-cache: true - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 +profiling: + type: "torch" + prefill: + start_step: 10 + stop_step: 20 + decode: + start_step: 10 + stop_step: 20 + benchmark: type: "sa-bench" isl: 1000 osl: 1000 - concurrencies: "8x32x128x256x512x1024" + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" req_rate: "inf" diff --git a/recipes/qwen3.5/experimental/1p1d-tp4-dep4.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4-acc.yaml similarity index 84% rename from recipes/qwen3.5/experimental/1p1d-tp4-dep4.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4-acc.yaml index a1766ff51..2e5bb5023 100644 --- a/recipes/qwen3.5/experimental/1p1d-tp4-dep4.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4-acc.yaml @@ -1,11 +1,11 @@ # Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + DEP4 Decode -# Decode: Data Expert Parallel (DP4 + TP4 + EP4) with dp-attention +# Accuracy Verification (GSM8K), prefix caching off, no staging buffer -name: "qwen3.5-1p1d-tp4-dep4" +name: "qwen3.5-1p1d-dep4-acc" model: path: "qwen3.5-fp8" - container: "dev" # 
docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -55,9 +55,7 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" tensor-parallel-size: 4 @@ -65,29 +63,26 @@ backend: expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 4 - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" # DEP4: DP4 + TP4 + EP4 with dp-attention tensor-parallel-size: 4 @@ -98,20 +93,23 @@ backend: moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" - disable-radix-cache: true - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 + cuda-graph-max-bs: 128 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: - type: "sa-bench" - isl: 1000 - osl: 1000 - concurrencies: "8x32x128x256x512x1024" - req_rate: "inf" + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/1p1d-dep4-dep4.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4dep4.yaml similarity index 94% rename from recipes/qwen3.5/1p1d-dep4-dep4.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4dep4.yaml 
index a76d2f90a..e98dd6956 100644 --- a/recipes/qwen3.5/1p1d-dep4-dep4.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4dep4.yaml @@ -1,12 +1,13 @@ # Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + DEP4 Decode # Both sides use Data Expert Parallel (DP4 + TP4 + EP4) with dp-attention # Homogeneous TP layout to avoid KV/Mamba state slice transfer overhead +# 1k1k sa-bench concurrency sweep -name: "qwen3.5-1p1d-dep4-dep4" +name: "qwen3.5-1p1d-dep4dep4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev-0318" precision: "fp8" resources: @@ -70,20 +71,17 @@ backend: moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 4 mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" @@ -103,20 +101,19 @@ backend: moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" - mamba-track-interval: 2048 + disable-radix-cache: true + mamba-track-interval: 2048 # must be > isl+osl to avoid checkpointing mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" - disable-radix-cache: true mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 cuda-graph-max-bs: 1024 - watchdog-timeout: 1000000 - decode-log-interval: 1 stream-interval: 50 + watchdog-timeout: 1000000 benchmark: type: "sa-bench" diff --git a/recipes/qwen3.5/nixl/1p1d-dep4-dep4.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gpqa.yaml similarity index 77% rename from recipes/qwen3.5/nixl/1p1d-dep4-dep4.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gpqa.yaml index 7d870677c..1895495b9 100644 --- 
a/recipes/qwen3.5/nixl/1p1d-dep4-dep4.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gpqa.yaml @@ -1,12 +1,14 @@ -# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + DEP4 Decode -# Both sides use Data Expert Parallel (DP4 + TP4 + EP4) with dp-attention -# Homogeneous TP layout to avoid KV/Mamba state slice transfer overhead +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + TP4 Decode +# Prefill uses DEP4 (DP4 + TP4 + EP4 with dp-attention) +# Decode uses pure TP4 +# No GPU staging buffer (direct scatter RDMA) +# GPQA accuracy benchmark -name: "qwen3.5-1p1d-dep4-dep4-nixl" +name: "qwen3.5-1p1d-dep4tp4-gpqa" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -61,7 +63,7 @@ backend: kv-cache-dtype: "fp8_e4m3" moe-runner-backend: "flashinfer_trtllm" - # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as decode) + # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as dep4dep4 prefill) tensor-parallel-size: 4 data-parallel-size: 4 expert-parallel-size: 4 @@ -70,21 +72,16 @@ backend: moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 4 mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" @@ -95,34 +92,26 @@ backend: kv-cache-dtype: "fp8_e4m3" moe-runner-backend: "flashinfer_trtllm" - # DEP4: DP4 + TP4 + EP4 with dp-attention + # TP4: pure tensor parallel, no dp-attention tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - moe-dense-tp-size: 1 
mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl - disable-radix-cache: true mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 cuda-graph-max-bs: 1024 - watchdog-timeout: 1000000 - decode-log-interval: 1 stream-interval: 50 + watchdog-timeout: 1000000 benchmark: - type: "sa-bench" - isl: 1000 - osl: 1000 - concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" - req_rate: "inf" + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 8 + num_threads: 128 diff --git a/recipes/qwen3.5/1p1d-tep4-tep4.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gsm8k-bench.yaml similarity index 73% rename from recipes/qwen3.5/1p1d-tep4-tep4.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gsm8k-bench.yaml index 90f8ad375..7b4dd382f 100644 --- a/recipes/qwen3.5/1p1d-tep4-tep4.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-gsm8k-bench.yaml @@ -1,11 +1,14 @@ -# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TEP4 Prefill + TEP4 Decode -# Both sides use Tensor Expert Parallel (TP4 + EP4), no dp-attention +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + TP4 Decode +# Prefill uses DEP4 (DP4 + TP4 + EP4 with dp-attention) +# Decode uses pure TP4 +# No GPU staging buffer (direct scatter RDMA) +# GSM8K-Bench accuracy benchmark (original bench style, 20-shot, 1319 questions) -name: "qwen3.5-1p1d-tep4-tep4" +name: "qwen3.5-1p1d-dep4tp4-gsm8k-bench" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -55,61 +58,60 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" - # TEP4: TP4 + EP4, standard TP attention (no 
dp-attention) + # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as dep4dep4 prefill) tensor-parallel-size: 4 + data-parallel-size: 4 expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" - # TEP4: TP4 + EP4, standard TP attention (no dp-attention) + # TP4: pure tensor parallel, no dp-attention tensor-parallel-size: 4 - expert-parallel-size: 4 - moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" - disable-radix-cache: true - mem-fraction-static: 0.70 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: - type: "sa-bench" - isl: 1000 - osl: 1000 - concurrencies: "8x32x128x256x512x1024" - req_rate: "inf" + type: "gsm8k-bench" + num_examples: 1319 + num_shots: 20 + max_tokens: 512 + num_threads: 256 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gpqa.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gpqa.yaml new file mode 100644 index 000000000..0110f7ba2 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gpqa.yaml @@ -0,0 +1,121 @@ +# 
Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + TP4 Decode +# Prefill uses DEP4 (DP4 + TP4 + EP4 with dp-attention) +# Decode uses pure TP4 +# GPU staging buffer enabled (bulk RDMA, ~1000x fewer RDMA WRs) +# GPQA accuracy benchmark + +name: "qwen3.5-1p1d-dep4tp4-staging-gpqa" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "32" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_POOL_SIZE_MB: "4096" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: 
"trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as dep4dep4 prefill) + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # TP4: pure tensor parallel, no dp-attention + tensor-parallel-size: 4 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 8 + num_threads: 128 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gsm8k-bench.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gsm8k-bench.yaml new file mode 100644 index 000000000..0a4052345 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-dep4tp4-staging-gsm8k-bench.yaml @@ -0,0 +1,121 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + TP4 Decode +# Prefill uses DEP4 (DP4 + TP4 + EP4 with dp-attention) +# Decode uses pure TP4 +# GPU staging buffer enabled (bulk RDMA, ~1000x fewer RDMA WRs) +# GSM8K-Bench accuracy benchmark (original bench style, 20-shot, 1319 
questions) + +name: "qwen3.5-1p1d-dep4tp4-staging-gsm8k-bench" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "32" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGG_STAGING_BUFFER: "1" + SGLANG_DISAGG_STAGING_POOL_SIZE_MB: "4096" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as dep4dep4 prefill) + tensor-parallel-size: 4 + 
data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # TP4: pure tensor parallel, no dp-attention + tensor-parallel-size: 4 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "gsm8k-bench" + num_examples: 1319 + num_shots: 20 + max_tokens: 512 + num_threads: 256 diff --git a/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4-acc.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4-acc.yaml new file mode 100644 index 000000000..1a3aebbd5 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4-acc.yaml @@ -0,0 +1,107 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode +# Accuracy Verification (GSM8K), prefix caching off, no staging buffer + +name: "qwen3.5-1p1d-tp4-acc" + +model: + path: "qwen3.5-fp8" + container: "dev" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + 
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + max-running-requests: 128 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + tensor-parallel-size: 4 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + max-running-requests: 128 
+ cuda-graph-max-bs: 128 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/1p1d-tp4-tp4.yaml b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4.yaml similarity index 88% rename from recipes/qwen3.5/1p1d-tp4-tp4.yaml rename to recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4.yaml index dfb1776ea..96e0e0472 100644 --- a/recipes/qwen3.5/1p1d-tp4-tp4.yaml +++ b/recipes/qwen3.5/fp8/disagg/mooncake/stp_prefix_off/1p1d-tp4.yaml @@ -1,11 +1,12 @@ # Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode # Pure tensor parallel, no expert parallel +# 1k1k sa-bench concurrency sweep -name: "qwen3.5-1p1d-tp4-tp4" +name: "qwen3.5-1p1d-tp4" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -53,9 +54,7 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" tensor-parallel-size: 4 @@ -63,47 +62,47 @@ backend: expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" tensor-parallel-size: 4 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" 
disaggregation-mode: "decode" - disable-radix-cache: true - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: type: "sa-bench" isl: 1000 osl: 1000 - concurrencies: "8x32x128x256x512x1024" + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" req_rate: "inf" diff --git a/recipes/qwen3.5/nixl/1p1d-tep4-tep4.yaml b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-acc.yaml similarity index 77% rename from recipes/qwen3.5/nixl/1p1d-tep4-tep4.yaml rename to recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-acc.yaml index b617da45b..23ee987f3 100644 --- a/recipes/qwen3.5/nixl/1p1d-tep4-tep4.yaml +++ b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-acc.yaml @@ -1,11 +1,11 @@ -# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TEP4 Prefill + TEP4 Decode -# Both sides use Tensor Expert Parallel (TP4 + EP4), no dp-attention +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + DEP4 Decode +# Accuracy Verification (GSM8K), NIXL transfer backend -name: "qwen3.5-1p1d-tep4-tep4-nixl" +name: "qwen3.5-1p1d-dep4-nixl-acc" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -55,63 +55,63 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" - # TEP4: TP4 + EP4, standard TP attention (no dp-attention) tensor-parallel-size: 4 - expert-parallel-size: 4 - moe-dense-tp-size: 1 + data-parallel-size: 1 + expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" disaggregation-transfer-backend: nixl - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - 
mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" - # TEP4: TP4 + EP4, standard TP attention (no dp-attention) + # DEP4: DP4 + TP4 + EP4 with dp-attention tensor-parallel-size: 4 + data-parallel-size: 4 expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true moe-dense-tp-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" disaggregation-transfer-backend: nixl - disable-radix-cache: true - mem-fraction-static: 0.70 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 + cuda-graph-max-bs: 128 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: - type: "sa-bench" - isl: 1000 - osl: 1000 - concurrencies: "8x32x128x256x512x1024" - req_rate: "inf" + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gpqa.yaml b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gpqa.yaml new file mode 100644 index 000000000..ae06a008d --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gpqa.yaml @@ -0,0 +1,116 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + DEP4 Decode +# NIXL transfer backend, no staging buffer +# Heterogeneous TP: prefill attn_tp=4, decode attn_tp=1 (dp-attention) +# Tests Mamba state slice transfer + engine_rank notification fix + +name: "qwen3.5-1p1d-dep4-nixl-gpqa" + +model: + path: "qwen3.5-fp8" + container: "dev-0318" + 
precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +slurm: + time_limit: "3:00:00" + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + 
kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "gpqa" + num_examples: 198 + max_tokens: 65536 + repeat: 8 + num_threads: 128 diff --git a/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gsm8k.yaml b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gsm8k.yaml new file mode 100644 index 000000000..95a765223 --- /dev/null +++ b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-dep4-nixl-gsm8k.yaml @@ -0,0 +1,116 @@ +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + DEP4 Decode +# NIXL transfer backend, no staging buffer +# Heterogeneous TP: prefill attn_tp=4, decode attn_tp=1 (dp-attention) +# Tests Mamba state slice transfer + engine_rank notification fix + +name: "qwen3.5-1p1d-dep4-nixl-gsm8k" + +model: + path: "qwen3.5-fp8" + container: "dev-0318" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +slurm: + time_limit: "2:00:00" + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.80 + chunked-prefill-size: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + 
stream-interval: 50 + watchdog-timeout: 1000000 + +benchmark: + type: "gsm8k" + num_examples: 1319 + num_shots: 8 + max_tokens: 512 + num_threads: 256 diff --git a/recipes/qwen3.5/nixl/1p1d-tp4-tp4-mtp.yaml b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl-acc.yaml similarity index 79% rename from recipes/qwen3.5/nixl/1p1d-tp4-tp4-mtp.yaml rename to recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl-acc.yaml index c4f7f1acd..de3cfadd3 100644 --- a/recipes/qwen3.5/nixl/1p1d-tp4-tp4-mtp.yaml +++ b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl-acc.yaml @@ -1,11 +1,11 @@ -# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode + MTP (EAGLE spec dec) -# Pure tensor parallel, no expert parallel, with speculative decoding +# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode +# Accuracy Verification (GSM8K), NIXL transfer backend -name: "qwen3.5-1p1d-tp4-tp4-mtp-nixl" +name: "qwen3.5-1p1d-tp4-nixl-acc" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -53,9 +53,7 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" tensor-parallel-size: 4 @@ -63,54 +61,49 @@ backend: expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" disaggregation-transfer-backend: nixl - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" 
kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" tensor-parallel-size: 4 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" - speculative-algorithm: "EAGLE" - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 - disaggregation-mode: "decode" disaggregation-transfer-backend: nixl - disable-radix-cache: true - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 - context-length: 2020 + max-running-requests: 128 + cuda-graph-max-bs: 128 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: - type: "sa-bench" - isl: 1000 - osl: 1000 - concurrencies: "8x32x128x256x512x1024" - req_rate: "inf" + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/nixl/1p1d-tp4-tp4.yaml b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl.yaml similarity index 88% rename from recipes/qwen3.5/nixl/1p1d-tp4-tp4.yaml rename to recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl.yaml index 13b4d045b..c22788789 100644 --- a/recipes/qwen3.5/nixl/1p1d-tp4-tp4.yaml +++ b/recipes/qwen3.5/fp8/disagg/nixl/stp_prefix_off/1p1d-tp4-nixl.yaml @@ -1,11 +1,13 @@ # Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: TP4 Prefill + TP4 Decode # Pure tensor parallel, no expert parallel +# 1k1k sa-bench concurrency sweep +# Using NIXL transfer backend -name: "qwen3.5-1p1d-tp4-tp4-nixl" +name: "qwen3.5-1p1d-tp4-nixl" model: path: "qwen3.5-fp8" - container: "dev" # docker://lmsysorg/sglang:dev + container: "dev" precision: "fp8" resources: @@ -53,9 +55,7 @@ backend: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" - quantization: "fp8" kv-cache-dtype: "fp8_e4m3" tensor-parallel-size: 4 @@ -63,49 +63,49 @@ backend: expert-parallel-size: 1 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true 
mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "prefill" disaggregation-transfer-backend: nixl - disable-radix-cache: true - disaggregation-decode-tp: 4 - disaggregation-decode-dp: 1 - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 load-balance-method: "round_robin" watchdog-timeout: 1000000 - disable-cuda-graph: true decode: served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" model-path: "/model/" - attention-backend: "trtllm_mha" quantization: "fp8" kv-cache-dtype: "fp8_e4m3" + moe-runner-backend: "flashinfer_trtllm" tensor-parallel-size: 4 mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true mamba-track-interval: 2048 mamba-ssm-dtype: "bfloat16" disaggregation-mode: "decode" disaggregation-transfer-backend: nixl - disable-radix-cache: true - mem-fraction-static: 0.75 + mem-fraction-static: 0.80 chunked-prefill-size: 16384 context-length: 2020 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 watchdog-timeout: 1000000 benchmark: type: "sa-bench" isl: 1000 osl: 1000 - concurrencies: "8x32x128x256x512x1024" + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" req_rate: "inf" diff --git a/recipes/qwen3.5/nvfp4/agg/mtp_radix_off/.gitkeep b/recipes/qwen3.5/nvfp4/agg/mtp_radix_off/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/recipes/qwen3.5/nvfp4/agg/mtp_radix_on/tp4-mtp-acc.yaml b/recipes/qwen3.5/nvfp4/agg/mtp_radix_on/tp4-mtp-acc.yaml new file mode 100644 index 000000000..7ada931c7 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/mtp_radix_on/tp4-mtp-acc.yaml @@ -0,0 +1,83 @@ +# FlashInfer GDN decode validation: NVFP4, linear-attn-decode-backend=flashinfer (ladfi), +# extra_buffer Mamba scheduler, NEXTN speculative decoding (MTP), concurrency 512. 
+ +name: "nvfp4-agg-tp4-mtp-acc" + +setup_script: "upgrade-flashinfer-v067.sh" + +model: + path: "qwen3.5-nvfp4" + container: "dev-0318" + precision: "fp4" + +frontend: + type: "sglang" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +slurm: + time_limit: "4:00:00" + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-v067-cache" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + tensor-parallel-size: 4 + expert-parallel-size: 1 + + quantization: modelopt_fp4 + kv-cache-dtype: fp8_e4m3 + + mamba-ssm-dtype: "bfloat16" + + attention-backend: trtllm_mha + moe-runner-backend: flashinfer_trtllm + linear-attn-decode-backend: flashinfer + + mamba-scheduler-strategy: "extra_buffer" + disable-radix-cache: false + mem-fraction-static: 0.85 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + + cuda-graph-max-bs: 512 + max-running-requests: 512 + + scheduler-recv-interval: 30 + stream-interval: 30 + + watchdog-timeout: 600 + + speculative-algorithm: NEXTN + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 512 + num_shots: 8 + temperature: 0.6 + top_p: 0.95 + top_k: 20 diff --git a/recipes/qwen3.5/nvfp4/agg/profile/tp4-profile.yaml b/recipes/qwen3.5/nvfp4/agg/profile/tp4-profile.yaml new file mode 100644 index 000000000..3b1527aa6 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/profile/tp4-profile.yaml @@ -0,0 +1,80 @@ +# NVFP4 Qwen3.5-397B aggregated TP4 — torch profiler trace (no_buffer mode) +# Base: sweep-nobuf-conc64.yaml +# Goal: confirm 
which GEMM kernels are FP4 vs BF16 at conc=64 + +name: "nvfp4-agg-tp4-profile" + +model: + path: "qwen3.5-nvfp4" + container: "dev-0318" + precision: "fp4" + +frontend: + type: "sglang" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +slurm: + time_limit: "2:00:00" + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + SGL_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + SGLANG_LOG_FORWARD_ITERS: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + tensor-parallel-size: 4 + expert-parallel-size: 1 + + quantization: modelopt_fp4 + kv-cache-dtype: fp8_e4m3 + fp4-gemm-backend: flashinfer_cutlass + + mamba-ssm-dtype: "bfloat16" + + attention-backend: trtllm_mha + moe-runner-backend: flashinfer_trtllm + + mamba-scheduler-strategy: "no_buffer" + disable-radix-cache: true + mem-fraction-static: 0.85 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + + cuda-graph-max-bs: 64 + max-running-requests: 64 + + scheduler-recv-interval: 30 + stream-interval: 30 + + watchdog-timeout: 600 + +profiling: + type: "torch" + aggregated: + start_step: 50 + stop_step: 70 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 128 + concurrencies: "64" + req_rate: "10" diff --git a/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4-acc.yaml b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4-acc.yaml new file mode 100644 index 000000000..fee037983 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4-acc.yaml @@ -0,0 +1,63 @@ +# Qwen3.5-397B-A17B-NVFP4 Aggregated DEP4 Accuracy Verification (GSM8K) +# Data Expert Parallel: DP4 + TP4 + EP4 with dp-attention + +name: "nvfp4-agg-dep4-acc" + +model: + path: "qwen3.5-nvfp4" + container: "dev" + precision: 
"fp4" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4.yaml b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4.yaml new file mode 100644 index 000000000..f9fc4b481 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/dep4.yaml @@ -0,0 +1,65 @@ +# Qwen3.5-397B-A17B-NVFP4 Aggregated DEP4 on GB200 +# Data Expert Parallel: DP4 + TP4 + EP4 with dp-attention + +name: "nvfp4-agg-dep4" + +model: + path: "qwen3.5-nvfp4" + container: "dev" + precision: "fp4" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + # DEP4: DP4 + TP4 + EP4 with dp-attention + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 1024 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" diff --git a/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tep4.yaml b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tep4.yaml new file mode 100644 index 000000000..487934d88 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tep4.yaml @@ -0,0 +1,62 @@ +# Qwen3.5-397B-A17B-NVFP4 Aggregated TEP4 on GB200 +# Tensor Expert Parallel: TP4 + EP4, no dp-attention; symmetric memory not enabled (enable-symm-mem is commented out) + +name: "nvfp4-agg-tep4" + +model: + path: "qwen3.5-nvfp4" + container: "dev" + precision: "fp4" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + attention-backend: "trtllm_mha" + 
quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + # TEP4: TP4 + EP4, standard TP attention (no dp-attention) + tensor-parallel-size: 4 + expert-parallel-size: 4 + moe-dense-tp-size: 1 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 1024 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" diff --git a/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4-acc.yaml b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4-acc.yaml new file mode 100644 index 000000000..fc807fa80 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4-acc.yaml @@ -0,0 +1,56 @@ +# Qwen3.5-397B-A17B-NVFP4 Aggregated TP4 Accuracy Verification (GSM8K) +# Pure tensor parallel, no expert parallel + +name: "nvfp4-agg-tp4-acc" + +model: + path: "qwen3.5-nvfp4" + container: "dev" + precision: "fp4" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 128 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + 
cuda-graph-max-bs: 128 + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 128 + num_shots: 8 diff --git a/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4.yaml b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4.yaml new file mode 100644 index 000000000..c26bfb852 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_prefix_off/tp4.yaml @@ -0,0 +1,58 @@ +# Qwen3.5-397B-A17B-NVFP4 Aggregated TP4 on GB200 +# Pure tensor parallel, no expert parallel; symmetric memory not enabled (enable-symm-mem is commented out) + +name: "nvfp4-agg-tp4" + +model: + path: "qwen3.5-nvfp4" + container: "dev" + precision: "fp4" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-gdc-cache" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + attention-backend: "trtllm_mha" + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + + disable-radix-cache: true + max-running-requests: 1024 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + context-length: 2020 + cuda-graph-max-bs: 1024 + # enable-symm-mem: true # may improve perf in some scenarios, benchmark before enabling + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 1000 + osl: 1000 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" diff --git a/recipes/qwen3.5/nvfp4/agg/stp_radix_on/tp4-acc.yaml b/recipes/qwen3.5/nvfp4/agg/stp_radix_on/tp4-acc.yaml new file mode 100644 index 000000000..b2358e643 --- /dev/null +++ b/recipes/qwen3.5/nvfp4/agg/stp_radix_on/tp4-acc.yaml @@ -0,0 +1,77 @@ +# 
FlashInfer GDN decode validation: NVFP4, linear-attn-decode-backend=flashinfer (ladfi), +# extra_buffer Mamba scheduler (radix cache on), no MTP, concurrency 512. + +name: "nvfp4-agg-tp4-radix-acc" + +setup_script: "upgrade-flashinfer-v067.sh" + +model: + path: "qwen3.5-nvfp4" + container: "dev-0318" + precision: "fp4" + +frontend: + type: "sglang" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +slurm: + time_limit: "4:00:00" + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-v067-cache" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + + sglang_config: + aggregated: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + + tensor-parallel-size: 4 + expert-parallel-size: 1 + + quantization: modelopt_fp4 + kv-cache-dtype: fp8_e4m3 + + mamba-ssm-dtype: "bfloat16" + + attention-backend: trtllm_mha + moe-runner-backend: flashinfer_trtllm + linear-attn-decode-backend: flashinfer + + mamba-scheduler-strategy: "extra_buffer" + disable-radix-cache: false + mem-fraction-static: 0.85 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + + cuda-graph-max-bs: 512 + max-running-requests: 512 + + scheduler-recv-interval: 30 + stream-interval: 30 + + watchdog-timeout: 600 + +benchmark: + type: "gsm8k" + num_examples: 1319 + max_tokens: 16000 + num_threads: 512 + num_shots: 8 + temperature: 0.6 + top_p: 0.95 + top_k: 20 diff --git a/recipes/qwen3.5/nvfp4/disagg/mtp_radix_off/.gitkeep b/recipes/qwen3.5/nvfp4/disagg/mtp_radix_off/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/recipes/qwen3.5/nvfp4/disagg/mtp_radix_on/.gitkeep b/recipes/qwen3.5/nvfp4/disagg/mtp_radix_on/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git 
a/recipes/qwen3.5/nvfp4/disagg/profile/.gitkeep b/recipes/qwen3.5/nvfp4/disagg/profile/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/recipes/qwen3.5/nvfp4/disagg/stp_prefix_off/.gitkeep b/recipes/qwen3.5/nvfp4/disagg/stp_prefix_off/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/recipes/qwen3.5/nvfp4/disagg/stp_radix_on/.gitkeep b/recipes/qwen3.5/nvfp4/disagg/stp_radix_on/.gitkeep new file mode 100644 index 000000000..e69de29bb