NVIDIA · ajrasane · Jun 5, 2026
@@ -0,0 +1,115 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Mixed-precision PTQ recipe for Nemotron-H hybrid models (Mamba-2 + MLP + Attention).
+#
+# Precision assignment mirrors the GGUF Q4_K_M quantization profile:
+#   MLP down_proj (Q6_K in Q4_K_M)  →  FP8 W8A8  (layers 1, 3, 5, 8, 10, 18, 25, 33, 41)
+#   All other linears               →  NVFP4 W4A4
+#   lm_head, embedding              →  NVFP4 W4A16 (weight-only)
+#   Mamba in_proj                   →  unquantized, bf16  (output dim 17504 is not a multiple of the Marlin tile size 64)
+#   Mamba conv1d                    →  unquantized, bf16
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  nvfp4: configs/numerics/nvfp4
+
+metadata:
+  recipe_type: ptq
+  # Human-readable summary of the recipe; keep it in sync with the quant_cfg
+  # entries under `quantize` (including the in_proj exclusion).
+  description: >-
+    Mixed-precision Nemotron-H recipe: FP8 W8A8 for Q6_K MLP down_proj layers,
+    NVFP4 W4A4 for all other linears except Mamba in_proj (left unquantized),
+    NVFP4 W4A16 for lm_head and embedding.
+
+# Quantization settings: the calibration algorithm plus the ordered list of
+# quantizer rules that implement the mixed-precision assignment.
+quantize:
+  algorithm: max
+  # Rules are written in override order: the comments below rely on a later
+  # entry taking precedence over an earlier one for any quantizer both match.
+  # NOTE(review): presumably the loader applies entries top to bottom — confirm.
+  quant_cfg:
+    # Baseline units, defined outside this file. Judging by their names,
+    # base_disable_all turns every quantizer off and w4a4_nvfp4_nvfp4 then
+    # enables NVFP4 for weights and activations — TODO confirm against the units.
+    - $import: base_disable_all
+    - $import: w4a4_nvfp4_nvfp4
+
+    # Q6_K MLP down_proj layers → FP8 W8A8 (override NVFP4 W4A4 above)
+    # Each listed layer (1, 3, 5, 8, 10, 18, 25, 33, 41) gets two entries, one
+    # for the weight quantizer and one for the input quantizer, both importing
+    # the `fp8` numerics.
+    # The literal '.' after the layer index keeps a pattern such as 'layers.1.'
+    # from also selecting 'layers.10.' (assuming glob-style matching — TODO confirm).
+    # NOTE(review): these patterns start at 'backbone.' with no leading '*',
+    # unlike the wildcard-prefixed patterns further down; confirm quantizer names
+    # are not prefixed by a wrapper module, otherwise these overrides would
+    # not apply and the layers would stay at NVFP4.
+    - quantizer_name: 'backbone.layers.1.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.1.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.3.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.3.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.5.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.5.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.8.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.8.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.10.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.10.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.18.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.18.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.25.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.25.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.33.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.33.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.41.mixer.down_proj*weight_quantizer'
+      cfg:
+        $import: fp8
+    - quantizer_name: 'backbone.layers.41.mixer.down_proj*input_quantizer'
+      cfg:
+        $import: fp8
+
+    # Standard exclusions (lm_head, embedding, conv1d, BatchNorm, etc.)
+    # Placed after the enabling rules above so the exclusions are not undone by them.
+    - $import: default_disabled_quantizers
+
+    # Mamba in_proj stays at bf16 — output dim 17504 not divisible by Marlin tile (64).
+    # (17504 / 64 = 273.5.) The pattern has no quantizer suffix, so it is meant to
+    # switch off every quantizer under in_proj, not just the weight quantizer.
+    # NOTE(review): it selects any module path containing 'mixer.in_proj',
+    # presumably only Mamba mixers have an in_proj — confirm.
+    - quantizer_name: '*mixer.in_proj*'
+      enable: false
+
+    # NVFP4 W4A16 for lm_head and embedding (re-enable weight-only after default_disabled_quantizers)
+    # Only the weight quantizers are re-enabled here; no entry touches the
+    # input quantizers of these modules, which is what makes this weight-only.
+    - quantizer_name: '*lm_head*weight_quantizer'
+      enable: true
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*embeddings*weight_quantizer'
+      enable: true
+      cfg:
+        $import: nvfp4