Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 0 additions & 147 deletions recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml

This file was deleted.

63 changes: 63 additions & 0 deletions recipes/qwen3.5/fp8/agg/mtp_radix_off/tp4-mtp-acc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K)
# NEXTN MTP speculative decoding, radix cache OFF

# NOTE(review): leading indentation appears stripped in this view (diff/scrape
# artifact); nesting of the sections below must match the original file.

name: "qwen3.5-agg-tp4-mtp-acc"

# Model artifact path, container tag, and numeric precision.
model:
path: "qwen3.5-fp8"
container: "dev"
precision: "fp8"

# Single aggregated node with one worker on 4 GB200 GPUs (matches
# tensor-parallel-size: 4 below).
resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 1
agg_workers: 1

backend:
type: sglang

# Environment for the aggregated worker. Values are quoted strings by design
# (env vars are text). Cache dirs under /configs presumably persist compiled
# DeepGEMM/FlashInfer artifacts across runs — confirm the volume mount.
aggregated_environment:
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"

# SGLang server flags for the aggregated deployment.
sglang_config:
aggregated:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"

# FP8 weights with FP8 e4m3 KV cache; TRT-LLM MHA attention backend.
attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"

tensor-parallel-size: 4

mamba-ssm-dtype: "bfloat16"
moe-runner-backend: "flashinfer_trtllm"

# NEXTN multi-token-prediction speculative decoding:
# 3 draft steps, top-1 per step, 4 draft tokens per verify round.
speculative-algorithm: "NEXTN"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

# Radix (prefix) cache disabled — this is the radix-OFF variant of the
# accuracy pair; the radix-ON sibling flips this flag and the mamba
# scheduler strategy.
disable-radix-cache: true
mamba-scheduler-strategy: "no_buffer"
max-running-requests: 128
mem-fraction-static: 0.8
chunked-prefill-size: 16384
max-prefill-tokens: 16384
cuda-graph-max-bs: 128

decode-log-interval: 1
stream-interval: 50

# GSM8K accuracy run: 1319 examples (presumably the full test split —
# confirm), 8-shot prompting, 128 client threads.
benchmark:
type: "gsm8k"
num_examples: 1319
max_tokens: 16000
num_threads: 128
num_shots: 8
63 changes: 63 additions & 0 deletions recipes/qwen3.5/fp8/agg/mtp_radix_on/tp4-mtp-acc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K)
# NEXTN MTP speculative decoding, radix cache ON (prefix caching enabled)

# NOTE(review): leading indentation appears stripped in this view (diff/scrape
# artifact); nesting of the sections below must match the original file.

name: "qwen3.5-agg-tp4-mtp-radix-acc"

# Model artifact path, container tag, and numeric precision.
model:
path: "qwen3.5-fp8"
container: "dev"
precision: "fp8"

# Single aggregated node with one worker on 4 GB200 GPUs (matches
# tensor-parallel-size: 4 below).
resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 1
agg_workers: 1

backend:
type: sglang

# Environment for the aggregated worker — identical to the radix-OFF
# sibling recipe; values are quoted strings by design.
aggregated_environment:
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"

# SGLang server flags for the aggregated deployment.
sglang_config:
aggregated:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"

# FP8 weights with FP8 e4m3 KV cache; TRT-LLM MHA attention backend.
attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"

tensor-parallel-size: 4

mamba-ssm-dtype: "bfloat16"
moe-runner-backend: "flashinfer_trtllm"

# NEXTN multi-token-prediction speculative decoding:
# 3 draft steps, top-1 per step, 4 draft tokens per verify round.
speculative-algorithm: "NEXTN"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

# Radix (prefix) cache enabled — the only deltas vs. the radix-OFF
# sibling are these two lines (extra_buffer vs no_buffer, false vs true).
mamba-scheduler-strategy: "extra_buffer"
disable-radix-cache: false
max-running-requests: 128
mem-fraction-static: 0.8
chunked-prefill-size: 16384
max-prefill-tokens: 16384
cuda-graph-max-bs: 128

decode-log-interval: 1
stream-interval: 50

# GSM8K accuracy run: 1319 examples (presumably the full test split —
# confirm), 8-shot prompting, 128 client threads.
benchmark:
type: "gsm8k"
num_examples: 1319
max_tokens: 16000
num_threads: 128
num_shots: 8
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-dep4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -30,7 +30,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -53,11 +52,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tep4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -30,7 +30,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -50,11 +49,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tp4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -29,7 +29,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -46,11 +45,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Loading
Loading