Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 0 additions & 147 deletions recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml

This file was deleted.

63 changes: 63 additions & 0 deletions recipes/qwen3.5/fp8/agg/mtp_radix_off/tp4-mtp-acc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K)
# NEXTN MTP speculative decoding, radix cache OFF

# NOTE(review): leading indentation appears stripped in this view (diff/scrape
# artifact); nesting of the sections below must match the original file.

name: "qwen3.5-agg-tp4-mtp-acc"

# Model artifact path, container tag, and numeric precision.
model:
path: "qwen3.5-fp8"
container: "dev"
precision: "fp8"

# Single aggregated node with one worker on 4 GB200 GPUs (matches
# tensor-parallel-size: 4 below).
resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 1
agg_workers: 1

backend:
type: sglang

# Environment for the aggregated worker. Values are quoted strings by design
# (env vars are text). Cache dirs under /configs presumably persist compiled
# DeepGEMM/FlashInfer artifacts across runs — confirm the volume mount.
aggregated_environment:
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"

# SGLang server flags for the aggregated deployment.
sglang_config:
aggregated:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"

# FP8 weights with FP8 e4m3 KV cache; TRT-LLM MHA attention backend.
attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"

tensor-parallel-size: 4

mamba-ssm-dtype: "bfloat16"
moe-runner-backend: "flashinfer_trtllm"

# NEXTN multi-token-prediction speculative decoding:
# 3 draft steps, top-1 per step, 4 draft tokens per verify round.
speculative-algorithm: "NEXTN"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

# Radix (prefix) cache disabled — this is the radix-OFF variant of the
# accuracy pair; the radix-ON sibling flips this flag and the mamba
# scheduler strategy.
disable-radix-cache: true
mamba-scheduler-strategy: "no_buffer"
max-running-requests: 128
mem-fraction-static: 0.8
chunked-prefill-size: 16384
max-prefill-tokens: 16384
cuda-graph-max-bs: 128

decode-log-interval: 1
stream-interval: 50

# GSM8K accuracy run: 1319 examples (presumably the full test split —
# confirm), 8-shot prompting, 128 client threads.
benchmark:
type: "gsm8k"
num_examples: 1319
max_tokens: 16000
num_threads: 128
num_shots: 8
63 changes: 63 additions & 0 deletions recipes/qwen3.5/fp8/agg/mtp_radix_on/tp4-mtp-acc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Qwen3.5-397B-A17B-FP8 Aggregated TP4 + MTP Accuracy Verification (GSM8K)
# NEXTN MTP speculative decoding, radix cache ON (prefix caching enabled)

# NOTE(review): leading indentation appears stripped in this view (diff/scrape
# artifact); nesting of the sections below must match the original file.

name: "qwen3.5-agg-tp4-mtp-radix-acc"

# Model artifact path, container tag, and numeric precision.
model:
path: "qwen3.5-fp8"
container: "dev"
precision: "fp8"

# Single aggregated node with one worker on 4 GB200 GPUs (matches
# tensor-parallel-size: 4 below).
resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 1
agg_workers: 1

backend:
type: sglang

# Environment for the aggregated worker — identical to the radix-OFF
# sibling recipe; values are quoted strings by design.
aggregated_environment:
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"

# SGLang server flags for the aggregated deployment.
sglang_config:
aggregated:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"

# FP8 weights with FP8 e4m3 KV cache; TRT-LLM MHA attention backend.
attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"

tensor-parallel-size: 4

mamba-ssm-dtype: "bfloat16"
moe-runner-backend: "flashinfer_trtllm"

# NEXTN multi-token-prediction speculative decoding:
# 3 draft steps, top-1 per step, 4 draft tokens per verify round.
speculative-algorithm: "NEXTN"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

# Radix (prefix) cache enabled — the only deltas vs. the radix-OFF
# sibling are these two lines (extra_buffer vs no_buffer, false vs true).
mamba-scheduler-strategy: "extra_buffer"
disable-radix-cache: false
max-running-requests: 128
mem-fraction-static: 0.8
chunked-prefill-size: 16384
max-prefill-tokens: 16384
cuda-graph-max-bs: 128

decode-log-interval: 1
stream-interval: 50

# GSM8K accuracy run: 1319 examples (presumably the full test split —
# confirm), 8-shot prompting, 128 client threads.
benchmark:
type: "gsm8k"
num_examples: 1319
max_tokens: 16000
num_threads: 128
num_shots: 8
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-dep4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -30,7 +30,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -53,11 +52,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tep4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -30,7 +30,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -50,11 +49,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "qwen3.5-agg-tp4"

model:
path: "qwen3.5-fp8"
container: "dev" # docker://lmsysorg/sglang:dev
container: "dev"
precision: "fp8"

resources:
Expand All @@ -29,7 +29,6 @@ backend:
served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
model-path: "/model/"


attention-backend: "trtllm_mha"
quantization: "fp8"
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -46,11 +45,17 @@ backend:
max-prefill-tokens: 16384
context-length: 2020
cuda-graph-max-bs: 1024
# enable-symm-mem: true
enable-symm-mem: true

decode-log-interval: 1
stream-interval: 50

profiling:
type: "torch"
aggregated:
start_step: 10
stop_step: 20

benchmark:
type: "sa-bench"
isl: 1000
Expand Down
Loading
Loading