diff --git a/recipes/qwen3.6/qwen3.6-35b-a3b-nvfp4-atlas.yaml b/recipes/qwen3.6/qwen3.6-35b-a3b-nvfp4-atlas.yaml new file mode 100644 index 0000000..f92584b --- /dev/null +++ b/recipes/qwen3.6/qwen3.6-35b-a3b-nvfp4-atlas.yaml @@ -0,0 +1,29 @@ +recipe_version: "2" +model: RedHatAI/Qwen3.6-35B-A3B-NVFP4 +runtime: atlas +container: avarok/atlas-gb10:latest +max_nodes: 1 + +metadata: + description: | + Qwen3.6-35B-A3B (RedHatAI NVFP4) with MTP K=2 on the Atlas runtime. + Mirrors the qwen3.5-35b-a3b-nvfp4 recipe but uses the qwen3_5_moe + architecture from Qwen3.6, served on a single GB10. NVFP4 weights + + NVFP4 KV cache + NVFP4 MTP-quantized draft head. + maintainer: avarok + category: agent + model_params: 35B + model_dtype: nvfp4 + quantization: nvfp4 + kv_dtype: nvfp4 + +defaults: + port: 8888 + host: 0.0.0.0 + max_model_len: 131072 + kv_cache_dtype: nvfp4 + gpu_memory_utilization: 0.88 + scheduling_policy: slai + speculative: true + mtp_quantization: nvfp4 + enable_prefix_caching: true