Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions modelopt_recipes/models/Nemotron-H/nvfp4_w4a16.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mixed-precision PTQ recipe for Nemotron-H hybrid models (Mamba-2 + MLP + Attention).
#
# Precision assignment mirrors the GGUF Q4_K_M quantization profile:
#   Q6_K MLP down_proj  → FP8 W8A8 (layers 1, 3, 5, 8, 10, 18, 25, 33, 41)
#   All other linears   → NVFP4 W4A4
#   lm_head, embedding  → NVFP4 W4A16 (weight-only)
#   Mamba in_proj       → bf16 (output dim 17504 is not divisible by the Marlin tile size, 64)
#   Mamba conv1d        → bf16

# Aliases for shared config fragments. Each alias is referenced from the
# `quantize` section of this recipe via `$import: <alias>`.
imports:
  base_disable_all: configs/ptq/units/base_disable_all
  w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4
  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
  fp8: configs/numerics/fp8
  nvfp4: configs/numerics/nvfp4

# Recipe-level metadata: the recipe type and a human-readable summary.
metadata:
  recipe_type: ptq
  # Folded scalar (`>-`): the two lines below are joined with a single space
  # and the trailing newline is stripped.
  description: >-
    Mixed-precision Nemotron-H recipe: FP8 W8A8 for Q6_K MLP down_proj layers,
    NVFP4 W4A4 for all other linears, NVFP4 W4A16 for lm_head and embedding.

# Quantization rules. Entries in `quant_cfg` are order-sensitive: as the
# override / re-enable notes below indicate, a later entry takes precedence
# over an earlier one that matches the same quantizer.
quantize:
  algorithm: max
  quant_cfg:
    # Baseline: start from the shared "disable all" unit, then apply
    # NVFP4 W4A4 (weights and activations) as the default precision.
    - $import: base_disable_all
    - $import: w4a4_nvfp4_nvfp4

    # Q6_K MLP down_proj layers → FP8 W8A8 (override NVFP4 W4A4 above).
    # Each layer needs two entries: one for the weight quantizer and one for
    # the input (activation) quantizer. The literal '.' after the layer index
    # keeps e.g. `layers.1.` from also matching layers 10 or 18.
    - quantizer_name: 'backbone.layers.1.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.1.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.3.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.3.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.5.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.5.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.8.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.8.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.10.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.10.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.18.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.18.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.25.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.25.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.33.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.33.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.41.mixer.down_proj*weight_quantizer'
      cfg:
        $import: fp8
    - quantizer_name: 'backbone.layers.41.mixer.down_proj*input_quantizer'
      cfg:
        $import: fp8

    # Standard exclusions (lm_head, embedding, conv1d, BatchNorm, etc.)
    - $import: default_disabled_quantizers

    # Mamba in_proj stays at bf16 — output dim 17504 not divisible by Marlin tile (64).
    # The trailing wildcard covers every quantizer attached to in_proj.
    - quantizer_name: '*mixer.in_proj*'
      enable: false

    # NVFP4 W4A16 for lm_head and embedding (re-enable weight-only after default_disabled_quantizers).
    # Only the weight quantizers are re-enabled here; these entries must stay
    # after the default_disabled_quantizers import to take effect.
    - quantizer_name: '*lm_head*weight_quantizer'
      enable: true
      cfg:
        $import: nvfp4
    - quantizer_name: '*embeddings*weight_quantizer'
      enable: true
      cfg:
        $import: nvfp4
Loading