diff --git a/examples/diffusers/quantization/config.py b/examples/diffusers/quantization/config.py index e15b8c7ba3c..7b472565a69 100644 --- a/examples/diffusers/quantization/config.py +++ b/examples/diffusers/quantization/config.py @@ -16,82 +16,21 @@ import torch.nn as nn from calib.plugin_calib import PercentileCalibrator -FP8_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": "max", -} +from modelopt.torch.opt.config_loader import load_config +from modelopt.torch.quantization.config import QuantizeConfig -INT8_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - {"quantizer_name": "*output_quantizer", "enable": False}, - ], - "algorithm": "max", -} - -NVFP4_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": "max", -} - -NVFP4_FP8_MHA_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - { - "quantizer_name": "**weight_quantizer", - 
"cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - { - "quantizer_name": "**input_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*[qkv]_bmm_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*bmm2_output_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": {"method": "svdquant", "lowrank": 32}, -} +FP8_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/fp8", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +INT8_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/int8", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +NVFP4_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/nvfp4", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +NVFP4_FP8_MHA_CONFIG = load_config( + "configs/ptq/presets/diffusers/nvfp4_fp8_mha", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) def set_quant_config_attr(quant_config, trt_high_precision_dtype, quant_algo, **kwargs): diff --git a/examples/diffusers/quantization/quantize.py b/examples/diffusers/quantization/quantize.py index 2a3c947a2d6..c719fbd45cc 100644 --- a/examples/diffusers/quantization/quantize.py +++ b/examples/diffusers/quantization/quantize.py @@ -14,6 +14,7 @@ # limitations under the License. 
import argparse +import copy import logging import sys import time as time @@ -114,19 +115,13 @@ def get_quant_config(self, n_steps: int, backbone: torch.nn.Module) -> Any: """ self.logger.info(f"Building quantization config for {self.config.format.value}") + apply_int8_percentile_calibrator = False if self.config.format == QuantFormat.INT8: if self.config.algo == QuantAlgo.SMOOTHQUANT: base_cfg = mtq.INT8_SMOOTHQUANT_CFG else: base_cfg = INT8_DEFAULT_CONFIG - if self.config.collect_method != CollectMethod.DEFAULT: - reset_set_int8_config( - base_cfg, - self.config.percentile, - n_steps, - collect_method=self.config.collect_method.value, - backbone=backbone, - ) + apply_int8_percentile_calibrator = self.config.collect_method != CollectMethod.DEFAULT elif self.config.format == QuantFormat.FP8: base_cfg = FP8_DEFAULT_CONFIG elif self.config.format == QuantFormat.FP4: @@ -137,7 +132,18 @@ def get_quant_config(self, n_steps: int, backbone: torch.nn.Module) -> Any: else: raise NotImplementedError(f"Unknown format {self.config.format}") - # Build a fresh config dict so we never mutate the global constants. + # Build a fresh config dict so runtime overrides never mutate the global constants. 
+ base_cfg = copy.deepcopy(base_cfg) + + if apply_int8_percentile_calibrator: + reset_set_int8_config( + base_cfg, + self.config.percentile, + n_steps, + collect_method=self.config.collect_method.value, + backbone=backbone, + ) + quant_cfg_list = list(base_cfg["quant_cfg"]) if self.config.format == QuantFormat.FP4: diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index ebd7c1090bb..db35e4841fb 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -15,6 +15,7 @@ import argparse from collections import defaultdict +from typing import Any import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -24,7 +25,7 @@ from modelopt.torch.utils import create_forward_loop from modelopt.torch.utils.dataset_utils import get_dataset_dataloader -SUPPORT_QUANT_FORMAT = { +SUPPORT_QUANT_FORMAT: dict[str, dict[str, Any]] = { "fp8": mtq.FP8_DEFAULT_CFG, "nvfp4": mtq.NVFP4_DEFAULT_CFG, } @@ -87,7 +88,7 @@ def loss_func(output, data): data_loader=calib_dataloader, forward_step=lambda model, batch: model(**batch), loss_func=loss_func, - quantization_formats=[SUPPORT_QUANT_FORMAT[format] for format in qformat_list], + quantization_formats=[SUPPORT_QUANT_FORMAT[quant_format] for quant_format in qformat_list], num_calib_steps=len(calib_dataloader), num_score_steps=min( len(calib_dataloader), 128 // batch_size diff --git a/modelopt/torch/opt/config_loader.py b/modelopt/torch/opt/config_loader.py index 76ed2bb6503..80864523e52 100644 --- a/modelopt/torch/opt/config_loader.py +++ b/modelopt/torch/opt/config_loader.py @@ -336,7 +336,19 @@ def _schema_equal(left: Any | None, right: Any | None) -> bool: def _list_element_schema(schema_type: Any | None) -> Any | None: """Return the element schema for a typed ``list[T]`` annotation.""" schema_type = _unwrap_schema_type(schema_type) - if get_origin(schema_type) is not list: + origin = get_origin(schema_type) + if origin 
in (UnionType, Union): + element_schemas = [] + for arg in get_args(schema_type): + if arg is NoneType: + continue + element_schema = _list_element_schema(arg) + if element_schema is None: + continue + if not any(_schema_equal(element_schema, seen) for seen in element_schemas): + element_schemas.append(element_schema) + return element_schemas[0] if len(element_schemas) == 1 else None + if origin is not list: return None args = get_args(schema_type) if len(args) != 1 or args[0] is Any: @@ -510,6 +522,12 @@ def _resolve_list_import( if _schema_equal(imported.schema_type, element_schema): return [imported.data] + element_schema_unwrapped = _unwrap_schema_type(element_schema) + if isinstance(imported.data, dict) and ( + element_schema_unwrapped is dict or get_origin(element_schema_unwrapped) is dict + ): + return [imported.data] + raise ValueError( f"$import {ref_name!r} in list at {context} has schema " f"{_schema_label(imported.schema_type, imported.schema)!r}; expected either " diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index b0c3fb859b2..fd95171ce43 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -150,7 +150,6 @@ """ -import copy import warnings from collections.abc import Mapping, Sequence from typing import Any, Literal @@ -1199,578 +1198,141 @@ class _QuantizeExportConfig(ModeloptBaseConfig): """An empty config.""" -# Shared snippet constants are dumped back to plain dicts before being spliced into -# the public quant config constants below. ``load_config`` returns validated -# ``QuantizerCfgEntry`` instances for schema-tagged files, but the public constants -# (``INT4_AWQ_CFG``, ``NVFP4_DEFAULT_CFG``, etc.) have always been raw dict/list trees; -# splatting schema instances into them would surprise callers that serialise the -# constants or do ``isinstance(entry, dict)`` checks. 
``exclude_unset=True`` keeps the -# sparse YAML shape (only the explicitly set fields) so the dumped dicts are -# byte-identical to what authors wrote in the YAML snippets. -_base_disable_all: list[dict[str, Any]] = [ - load_config("configs/ptq/units/base_disable_all").model_dump(exclude_unset=True) -] - -_default_disabled_quantizer_cfg: list[dict[str, Any]] = [ - entry.model_dump(exclude_unset=True) - for entry in load_config("configs/ptq/units/default_disabled_quantizers") -] - -_mamba_moe_disabled_quantizer_cfg: list[dict[str, Any]] = [ - {"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE - {"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE - {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection (HF naming) - { - "quantizer_name": "*self_attention.linear_qkv*", - "enable": False, - }, # Skip QKV Linear (Mcore naming) - { - "quantizer_name": "*self_attention.linear_proj*", - "enable": False, - }, # Skip QKV Output Projection (Mcore naming) -] - -INT8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -INT8_SMOOTHQUANT_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "smoothquant", -} - -INT8_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, 
def _load_quantizer_attribute_dict(config_path: str) -> dict[str, Any]:
    """Return the quantizer attributes stored at ``config_path`` as a plain dict.

    The file is loaded through ``load_config`` with ``QuantizerAttributeConfig``
    as the requested schema.

    Args:
        config_path: Config path handed to ``load_config``.

    Returns:
        For a ``QuantizerAttributeConfig`` result, its
        ``model_dump(exclude_unset=True)`` output; for any other ``Mapping``
        result, a shallow ``dict`` copy of it.

    Raises:
        TypeError: If the loaded object is neither a ``QuantizerAttributeConfig``
            nor a ``Mapping``.
    """
    loaded = load_config(config_path, schema_type=QuantizerAttributeConfig)
    # Reject unsupported payloads up front so the happy paths below stay flat.
    if not isinstance(loaded, (QuantizerAttributeConfig, Mapping)):
        raise TypeError(f"{config_path} must declare QuantizerAttributeConfig.")
    # The schema instance is tested first, matching the precedence of the two
    # supported result kinds.
    if isinstance(loaded, QuantizerAttributeConfig):
        return loaded.model_dump(exclude_unset=True)
    return dict(loaded)
def _dump_public_dict(value: Any, config_path: str, expected: str) -> dict[str, Any]:
    """Convert one loaded config object into a plain public dict.

    Objects exposing a callable ``model_dump`` are dumped with
    ``exclude_unset=True``; any other ``Mapping`` is shallow-copied into a
    ``dict``. Anything else raises ``TypeError`` naming ``config_path``.
    """
    dump = getattr(value, "model_dump", None)
    # Model-like objects are handled first so inputs that already worked keep
    # producing exactly the same dump as before.
    if callable(dump):
        return dump(exclude_unset=True)
    if isinstance(value, Mapping):
        return dict(value)
    raise TypeError(f"{config_path} must declare {expected}; got {type(value).__name__}.")


def _load_quantize_config_dict(config_path: str) -> dict[str, Any]:
    """Load a schema-backed QuantizeConfig YAML as a public legacy-shape dict.

    Args:
        config_path: Config path handed to ``load_config`` together with
            ``schema_type=QuantizeConfig``.

    Returns:
        The loaded config dumped with ``exclude_unset=True``, or a shallow
        ``dict`` copy when the loader hands back a plain mapping.

    Raises:
        TypeError: If the loaded object is neither model-like nor a ``Mapping``.
    """
    config = load_config(config_path, schema_type=QuantizeConfig)
    return _dump_public_dict(config, config_path, "QuantizeConfig")


def _load_quantizer_cfg_dict_list(config_path: str) -> list[dict[str, Any]]:
    """Load a QuantizerCfgEntry or QuantizerCfgListConfig snippet as public dict entries.

    Args:
        config_path: Config path handed to ``load_config`` (no explicit schema).

    Returns:
        One plain dict per entry. A non-list result is treated as a single
        entry, so the return value is always a list.

    Raises:
        TypeError: If an entry is neither model-like nor a ``Mapping``. This
            replaces the bare ``AttributeError`` that an unconditional
            ``.model_dump()`` call raised for such entries, and names the
            offending file.
    """
    config = load_config(config_path)
    entries = config if isinstance(config, list) else [config]
    return [
        _dump_public_dict(entry, config_path, "QuantizerCfgEntry or QuantizerCfgListConfig")
        for entry in entries
    ]
This could change in the future -W4A8_AWQ_BETA_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": [ - { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - }, - { - "num_bits": (4, 3), - }, - ], - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "awq_lite", -} +_base_disable_all: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/base_disable_all" +) -MXFP8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_default_disabled_quantizer_cfg: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/default_disabled_quantizers" +) -MXFP6_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_mamba_moe_disabled_quantizer_cfg: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/mamba_moe_disabled_quantizers" +) -MXFP4_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (2, 1), - 
"block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_nvfp4_cfg: dict[str, Any] = _load_quantizer_attribute_dict("configs/numerics/nvfp4") -W4A8_MXFP4_FP8_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_nvfp4_cfg_bs32: dict[str, Any] = _load_quantizer_attribute_dict("configs/numerics/nvfp4_bs32") -MXINT8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +INT8_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/int8") +INT8_SMOOTHQUANT_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/int8_smoothquant" +) +INT8_WEIGHT_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/int8_weight_only" +) +FP8_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/fp8") +MAMBA_MOE_FP8_AGGRESSIVE_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/mamba_moe_fp8_aggressive" +) +MAMBA_MOE_FP8_CONSERVATIVE_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/mamba_moe_fp8_conservative" +) +FP8_PER_CHANNEL_PER_TOKEN_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/fp8_per_channel_per_token" +) 
+FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/fp8_2d_blockwise_weight_only" +) +INT4_BLOCKWISE_WEIGHT_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/int4_blockwise_weight_only" +) +INT4_AWQ_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/int4_awq") +W4A8_AWQ_BETA_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/w4a8_awq_beta" +) +MXFP8_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/mxfp8") +MXFP6_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/mxfp6") +MXFP4_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/mxfp4") +W4A8_MXFP4_FP8_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/w4a8_mxfp4_fp8" +) +MXINT8_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/mxint8") # KV-cache configs are designed to be merged with a primary quantization config (e.g. # FP8_DEFAULT_CFG) that already contains _base_disable_all. They intentionally omit both # _base_disable_all and "algorithm" because these are provided by the primary config. 
-FP8_KV_CFG: dict[str, Any] = load_config("configs/ptq/presets/kv/fp8").model_dump( - exclude_unset=True -) - -FP8_AFFINE_KV_CFG = { - "quant_cfg": [ - { - "quantizer_name": "*[kv]_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - "bias": {-2: None, -4: None, "type": "static"}, - }, - }, - ] -} - -_nvfp4_cfg = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, -} - -_nvfp4_cfg_bs32 = { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, -} +FP8_KV_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/kv/fp8") +FP8_AFFINE_KV_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/kv/fp8_affine") - -def _nvfp4_selective_quant_cfg( - layer_patterns: list[str], - *, - quantizer: dict = _nvfp4_cfg, - weight_only: bool = False, - algorithm: str | dict = "max", -) -> dict: - """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: list[dict[str, Any]] = [] - quant_cfg.extend(_base_disable_all) - for pattern in layer_patterns: - # Deep-copy the quantizer dict so each config constant gets its own instance. 
- quant_cfg.append( - {"quantizer_name": f"{pattern}weight_quantizer", "cfg": copy.deepcopy(quantizer)} - ) - if not weight_only: - quant_cfg.append( - {"quantizer_name": f"{pattern}input_quantizer", "cfg": copy.deepcopy(quantizer)} - ) - quant_cfg.extend(_default_disabled_quantizer_cfg) - return {"quant_cfg": quant_cfg, "algorithm": algorithm} - - -NVFP4_DEFAULT_CFG = _nvfp4_selective_quant_cfg(["*"]) - -NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - }, - }, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": { - "method": "mse", - "fp8_scale_sweep": True, - }, -} - -NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - }, - }, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": { - "method": "local_hessian", - "fp8_scale_sweep": True, - }, -} - -MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - ], - "algorithm": "max", -} -MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - {"quantizer_name": "*mixer.in_proj*", "enable": False}, # Skip mamba linear - {"quantizer_name": "*mixer.out_proj*", "enable": False}, # Skip mamba linear - ], - "algorithm": "max", 
-} - -NVFP4_AWQ_LITE_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm="awq_lite") - -NVFP4_AWQ_CLIP_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm={"method": "awq_clip"}) - -NVFP4_AWQ_FULL_CFG = _nvfp4_selective_quant_cfg( - ["*"], algorithm={"method": "awq_full", "alpha_step": 0.1} +NVFP4_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/model/nvfp4") +NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep" ) - -# See comment above FP8_KV_CFG — KV-cache configs omit _base_disable_all and "algorithm". -NVFP4_AFFINE_KV_CFG = { - "quant_cfg": [ - { - "quantizer_name": "*[kv]_bmm_quantizer", - "cfg": { - **_nvfp4_cfg, - "bias": {-2: None, -4: None, "type": "static"}, - }, - }, - ] -} - -NVFP4_KV_CFG = { - "quant_cfg": [ - {"quantizer_name": "*[kv]_bmm_quantizer", "cfg": _nvfp4_cfg}, - ] -} - -# Moved from examples/diffusers/quantization/config.py to here -NVFP4_FP8_MHA_CONFIG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*output_quantizer", "enable": False}, - { - "quantizer_name": "*q_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*k_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*v_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*softmax_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "transformer_blocks*bmm2_output_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - ], - "algorithm": "max", -} - -# See comment above FP8_KV_CFG — KV-cache configs omit _base_disable_all and "algorithm". 
-NVFP4_KV_ROTATE_CFG = { - "quant_cfg": [ - { - # q_bmm is disabled but pre-configured with rotate=True so that downstream - # code can inspect the rotate flag even while the quantizer is off. - "quantizer_name": "*q_bmm_quantizer", - "cfg": { - "rotate": True, - }, - "enable": False, - }, - { - "quantizer_name": "*k_bmm_quantizer", - "cfg": { - **_nvfp4_cfg, - "rotate": True, - }, - }, - {"quantizer_name": "*v_bmm_quantizer", "cfg": _nvfp4_cfg}, - ], - "algorithm": "max", -} - -NVFP4_SVDQUANT_DEFAULT_CFG = _nvfp4_selective_quant_cfg( - ["*"], algorithm={"method": "svdquant", "lowrank": 32} +NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian" ) - -W4A8_NVFP4_FP8_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} -W4A16_NVFP4_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True) - -MXFP4_MLP_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*mlp*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*block_sparse_moe*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} - -NVFP4_MLP_WEIGHT_ONLY_CFG = _nvfp4_selective_quant_cfg( - ["*mlp*", "*block_sparse_moe*"], quantizer=_nvfp4_cfg_bs32, weight_only=True +MAMBA_MOE_NVFP4_AGGRESSIVE_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/mamba_moe_nvfp4_aggressive" +) +MAMBA_MOE_NVFP4_CONSERVATIVE_CFG: dict[str, Any] = 
_load_quantize_config_dict( + "configs/ptq/presets/model/mamba_moe_nvfp4_conservative" +) +NVFP4_AWQ_LITE_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_awq_lite" +) +NVFP4_AWQ_CLIP_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_awq_clip" +) +NVFP4_AWQ_FULL_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_awq_full" +) +NVFP4_AFFINE_KV_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/kv/nvfp4_affine" +) +NVFP4_KV_CFG: dict[str, Any] = _load_quantize_config_dict("configs/ptq/presets/kv/nvfp4") +NVFP4_FP8_MHA_CONFIG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_fp8_mha" +) +NVFP4_KV_ROTATE_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/kv/nvfp4_rotate" +) +NVFP4_SVDQUANT_DEFAULT_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_svdquant" +) +W4A8_NVFP4_FP8_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/w4a8_nvfp4_fp8" +) +W4A16_NVFP4_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/w4a16_nvfp4" +) +MXFP4_MLP_WEIGHT_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/mxfp4_mlp_weight_only" +) +NVFP4_MLP_WEIGHT_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_mlp_weight_only" +) +NVFP4_EXPERTS_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_experts_only" +) +NVFP4_MLP_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_mlp_only" ) -NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg( - ["*mlp.experts*", "*block_sparse_moe*", "*.experts.*"] +NVFP4_OMLP_ONLY_CFG: dict[str, Any] = _load_quantize_config_dict( + "configs/ptq/presets/model/nvfp4_omlp_only" ) -NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*", 
"*.experts.*"]) -NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file @@ -1786,6 +1348,7 @@ def _nvfp4_selective_quant_cfg( "INT8_SMOOTHQUANT_CFG", "INT8_WEIGHT_ONLY_CFG", "MXFP4_DEFAULT_CFG", + "MXFP6_DEFAULT_CFG", "MXFP8_DEFAULT_CFG", "MXINT8_DEFAULT_CFG", "NVFP4_AFFINE_KV_CFG", @@ -1810,6 +1373,7 @@ def _nvfp4_selective_quant_cfg( "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG", "MAMBA_MOE_FP8_CONSERVATIVE_CFG", "MAMBA_MOE_FP8_AGGRESSIVE_CFG", + "NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG", "NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG", } diff --git a/modelopt_recipes/configs/numerics/fp8.yaml b/modelopt_recipes/configs/numerics/fp8.yaml index ab1da6fad5f..7761dd106c0 100644 --- a/modelopt_recipes/configs/numerics/fp8.yaml +++ b/modelopt_recipes/configs/numerics/fp8.yaml @@ -13,9 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 quantizer attributes (per-tensor; used for weight/activation/KV). -# ``axis: null`` is explicit to match the hardcoded ``FP8_DEFAULT_CFG`` shape — -# downstream code that keys on ``"axis" in cfg`` sees the same dict layout. +# Per-tensor FP8 E4M3 quantizer attributes. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/modelopt_recipes/configs/numerics/int4_per_block.yaml b/modelopt_recipes/configs/numerics/int4_per_block.yaml new file mode 100644 index 00000000000..35d9f53a17a --- /dev/null +++ b/modelopt_recipes/configs/numerics/int4_per_block.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Static INT4 quantizer attributes with 128-value blocks on the last dimension. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 4 +block_sizes: + -1: 128 + type: static diff --git a/modelopt_recipes/configs/numerics/int8.yaml b/modelopt_recipes/configs/numerics/int8.yaml new file mode 100644 index 00000000000..41e8835c374 --- /dev/null +++ b/modelopt_recipes/configs/numerics/int8.yaml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Per-tensor INT8 quantizer attributes. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 8 +axis: diff --git a/modelopt_recipes/configs/numerics/int8_per_channel.yaml b/modelopt_recipes/configs/numerics/int8_per_channel.yaml new file mode 100644 index 00000000000..31c10635fc4 --- /dev/null +++ b/modelopt_recipes/configs/numerics/int8_per_channel.yaml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Per-channel INT8 quantizer attributes with axis 0. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 8 +axis: 0 diff --git a/modelopt_recipes/configs/numerics/mxfp4.yaml b/modelopt_recipes/configs/numerics/mxfp4.yaml new file mode 100644 index 00000000000..f32fde304f2 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp4.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamic MXFP4 E2M1 block quantizer attributes with E8M0 scales. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e2m1 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/mxfp6.yaml b/modelopt_recipes/configs/numerics/mxfp6.yaml new file mode 100644 index 00000000000..f8849edd294 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp6.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamic MXFP6 E3M2 block quantizer attributes with E8M0 scales. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e3m2 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/mxfp8.yaml b/modelopt_recipes/configs/numerics/mxfp8.yaml new file mode 100644 index 00000000000..46cb3d9f7c7 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp8.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamic MXFP8 E4M3 block quantizer attributes with E8M0 scales. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e4m3 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/mxint8.yaml b/modelopt_recipes/configs/numerics/mxint8.yaml new file mode 100644 index 00000000000..388b251de67 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxint8.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamic MXINT8 block quantizer attributes with E8M0 scales. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 8 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/nvfp4.yaml b/modelopt_recipes/configs/numerics/nvfp4.yaml index 68629c009fb..88598e36e85 100644 --- a/modelopt_recipes/configs/numerics/nvfp4.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales (dynamic calibration, the default). +# Dynamic NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml new file mode 100644 index 00000000000..a84b63a91d3 --- /dev/null +++ b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamic NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales and block size 32. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e2m1 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e4m3 diff --git a/modelopt_recipes/configs/numerics/nvfp4_static.yaml b/modelopt_recipes/configs/numerics/nvfp4_static.yaml index 32bd247b79a..9f6ac62e11e 100644 --- a/modelopt_recipes/configs/numerics/nvfp4_static.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4_static.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales (used for NVFP4 weights since weight scales can be static). +# Static NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/ptq/presets/README.md b/modelopt_recipes/configs/ptq/presets/README.md index 3ab307fe453..2aeec2d2b33 100644 --- a/modelopt_recipes/configs/ptq/presets/README.md +++ b/modelopt_recipes/configs/ptq/presets/README.md @@ -1,8 +1,8 @@ # PTQ Preset Configs This directory holds preset quantization configurations that serve as the -YAML source of truth for the hardcoded `*_CFG` dicts in -`modelopt.torch.quantization.config` (e.g., `FP8_DEFAULT_CFG`, +YAML source of truth for the `*_CFG` `QuantizeConfig` constants exposed +from `modelopt.torch.quantization.config` (e.g., `FP8_DEFAULT_CFG`, `FP8_KV_CFG`). 
Presets compose from the reusable snippets in `configs/numerics/` and @@ -25,6 +25,10 @@ own imports have been resolved. be merged on top of a `model/` preset via `$import` to produce a complete config. Example: `kv/fp8.yaml` (the YAML source of `FP8_KV_CFG`). +- **`diffusers/`** — Diffusers-specific full quantization presets. These + files are complete configs used by the Diffusers examples, including + attention and softmax quantizer choices that differ from the generic + `model/` presets. **Note:** The main purpose of these presets is to support the existing `hf_ptq.py` script's `--qformat` / `--kv_cache_qformat` flags and other diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml new file mode 100644 index 00000000000..7cb89a7bfed --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers FP8 preset with per-tensor E4M3 weights, inputs, and softmax quantizers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml new file mode 100644 index 00000000000..be12d717451 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers INT8 preset with per-channel weights and per-tensor inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8_per_channel + - quantizer_name: '*input_quantizer' + cfg: + $import: int8 + - quantizer_name: '*output_quantizer' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml new file mode 100644 index 00000000000..691defb2ae2 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers NVFP4 preset with dynamic E2M1 block quantization and FP8 softmax. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml new file mode 100644 index 00000000000..ee8c2704fc5 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers Flux preset with dynamic NVFP4 weights/inputs and FP8 attention quantizers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + attention_qkv_fp8: configs/ptq/units/attention_qkv_fp8 + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + +algorithm: + method: svdquant + lowrank: 32 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '**weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '**input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - $import: attention_qkv_fp8 + - quantizer_name: '*bmm2_output_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml index 7e97f0bc77b..21894ef9c01 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml @@ -13,10 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization preset. -# Equivalent to the hardcoded FP8_KV_CFG in config.py. -# This is a partial config (no algorithm, no base_disable_all) — designed -# to be merged with a primary model quantization config. +# Partial QuantizeConfig that enables FP8 E4M3 KV-cache quantizers. +# Merge this fragment with a primary model quantization preset. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml new file mode 100644 index 00000000000..4540df34ea9 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables affine FP8 E4M3 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_fp8_affine: configs/ptq/units/kv_fp8_affine + +quant_cfg: + - $import: kv_fp8_affine diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml new file mode 100644 index 00000000000..6d759e2c115 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables NVFP4 KV-cache quantizers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_nvfp4: configs/ptq/units/kv_nvfp4 + +quant_cfg: + - $import: kv_nvfp4 diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml new file mode 100644 index 00000000000..1f2a871010b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables affine NVFP4 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_nvfp4_affine: configs/ptq/units/kv_nvfp4_affine + +quant_cfg: + - $import: kv_nvfp4_affine diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml new file mode 100644 index 00000000000..2451ee1a359 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables rotated NVFP4 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_nvfp4_rotate: configs/ptq/units/kv_nvfp4_rotate + +algorithm: max +quant_cfg: + - $import: kv_nvfp4_rotate diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml index af80b57fe48..423904a6e18 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 per-tensor weight and activation (W8A8), max calibration. -# Equivalent to the hardcoded FP8_DEFAULT_CFG in config.py. +# QuantizeConfig preset for W8A8 FP8 E4M3 with per-tensor weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml new file mode 100644 index 00000000000..a8d6bbb03f8 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for FP8 E4M3 2D blockwise weight-only quantization. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + block_sizes: + -1: 128 + -2: 128 + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml new file mode 100644 index 00000000000..98a42f49591 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for FP8 E4M3 per-channel weights and per-token dynamic inputs. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + axis: 0 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + type: dynamic + block_sizes: + -1: + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml new file mode 100644 index 00000000000..828aef7d06f --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for AWQ-lite INT4 weight-only quantization. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int4_per_block: configs/numerics/int4_per_block + +algorithm: + method: awq_lite + alpha_step: 0.1 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int4_per_block + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml new file mode 100644 index 00000000000..f55351812c4 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for INT4 blockwise weight-only quantization. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int4_per_block: configs/numerics/int4_per_block + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int4_per_block + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8.yaml b/modelopt_recipes/configs/ptq/presets/model/int8.yaml new file mode 100644 index 00000000000..1bfc7b95f0c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for INT8 per-channel weights and per-tensor inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8_per_channel + - quantizer_name: '*input_quantizer' + cfg: + $import: int8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml new file mode 100644 index 00000000000..d75522bfced --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for SmoothQuant INT8 per-channel weights and per-tensor inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 + +algorithm: smoothquant +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8_per_channel + - quantizer_name: '*input_quantizer' + cfg: + $import: int8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml new file mode 100644 index 00000000000..cc475ab6103 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for INT8 per-channel weight-only quantization. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8_per_channel: configs/numerics/int8_per_channel + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8_per_channel + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml new file mode 100644 index 00000000000..a556941c43d --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for FP8 W8A8 Mamba-MoE quantization with shared exclusions. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w8a8_fp8_fp8: configs/ptq/units/w8a8_fp8_fp8 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w8a8_fp8_fp8 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml new file mode 100644 index 00000000000..f7f693ae673 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for FP8 W8A8 Mamba-MoE quantization with mixer projections disabled. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w8a8_fp8_fp8: configs/ptq/units/w8a8_fp8_fp8 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w8a8_fp8_fp8 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers + - quantizer_name: '*mixer.in_proj*' + enable: false + - quantizer_name: '*mixer.out_proj*' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml new file mode 100644 index 00000000000..4ad8accce75 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 Mamba-MoE quantization with shared exclusions. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml new file mode 100644 index 00000000000..f7420bb7c07 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 Mamba-MoE quantization with mixer projections disabled. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers + - quantizer_name: '*mixer.in_proj*' + enable: false + - quantizer_name: '*mixer.out_proj*' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml new file mode 100644 index 00000000000..982e22144ec --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic MXFP4 block quantization on weights and inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml new file mode 100644 index 00000000000..8d03600e872 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic MXFP4 block weight-only quantization on MLP/MoE layers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: mxfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml new file mode 100644 index 00000000000..e8d590f3848 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic MXFP6 block quantization on weights and inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp6: configs/numerics/mxfp6 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp6 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp6 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml new file mode 100644 index 00000000000..7cf2832311c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic MXFP8 block quantization on weights and inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp8: configs/numerics/mxfp8 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml new file mode 100644 index 00000000000..e6ef1ca3d06 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic MXINT8 block quantization on weights and inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxint8: configs/numerics/mxint8 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxint8 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxint8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml new file mode 100644 index 00000000000..ee74eebeccc --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on weights and inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml new file mode 100644 index 00000000000..4d1d0d5ee9b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 quantization with AWQ clip calibration. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: + method: awq_clip +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml new file mode 100644 index 00000000000..d41046d7d37 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 quantization with full AWQ calibration. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: + method: awq_full + alpha_step: 0.1 +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml new file mode 100644 index 00000000000..70313afac08 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 quantization with AWQ-lite calibration. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: awq_lite +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml new file mode 100644 index 00000000000..fdd18dfe36b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on expert layers only. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + block_sparse_moe_nvfp4: configs/ptq/units/block_sparse_moe_nvfp4 + experts_nvfp4: configs/ptq/units/experts_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: block_sparse_moe_nvfp4 + - $import: experts_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml new file mode 100644 index 00000000000..abebea4917b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for Diffusers NVFP4 with FP8 attention quantizers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + attention_qkv_fp8: configs/ptq/units/attention_qkv_fp8 + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - $import: attention_qkv_fp8 + - quantizer_name: 'transformer_blocks*bmm2_output_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml new file mode 100644 index 00000000000..c5d36fd9236 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on MLP/MoE layers only. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + block_sparse_moe_nvfp4: configs/ptq/units/block_sparse_moe_nvfp4 + experts_nvfp4: configs/ptq/units/experts_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*input_quantizer' + cfg: + $import: nvfp4 + - $import: block_sparse_moe_nvfp4 + - $import: experts_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml new file mode 100644 index 00000000000..952ea3a90db --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 block-size-32 weight-only quantization on MLP/MoE layers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4_bs32: configs/numerics/nvfp4_bs32 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml new file mode 100644 index 00000000000..82bf401ea9f --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on output projections and MLP/MoE layers. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + block_sparse_moe_nvfp4: configs/ptq/units/block_sparse_moe_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*o_proj*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*o_proj*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*input_quantizer' + cfg: + $import: nvfp4 + - $import: block_sparse_moe_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml new file mode 100644 index 00000000000..5acb834db2c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 quantization with SVDQuant low-rank calibration. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: + method: svdquant + lowrank: 32 +quant_cfg: + - $import: base_disable_all + - $import: w4a4_nvfp4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml new file mode 100644 index 00000000000..ac6a3094b7c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 with static weight scales from local-Hessian calibration. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + nvfp4_static: configs/numerics/nvfp4_static + +algorithm: + method: local_hessian + fp8_scale_sweep: true +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_static + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml new file mode 100644 index 00000000000..3ae22dbc3a6 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 W4A4 with static weight scales from MSE FP8-scale sweep. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + nvfp4_static: configs/numerics/nvfp4_static + +algorithm: + method: mse + fp8_scale_sweep: true +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_static + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a16_nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a16_nvfp4.yaml new file mode 100644 index 00000000000..7a189858c65 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a16_nvfp4.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for NVFP4 weight-only quantization on all layers (W4A16). 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + w4_nvfp4: configs/ptq/units/w4_nvfp4 + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - $import: w4_nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml new file mode 100644 index 00000000000..12073e14601 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for W4A8 AWQ-lite with INT4 block weights and FP8 inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + int4_per_block: configs/numerics/int4_per_block + +algorithm: awq_lite +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + - $import: int4_per_block + - $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml new file mode 100644 index 00000000000..428cb659da5 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for W4A8 with MXFP4 block weights and FP8 inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml new file mode 100644 index 00000000000..9b7e541abcc --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizeConfig preset for W4A8 with NVFP4 block-size-32 weights and FP8 inputs. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4_bs32: configs/numerics/nvfp4_bs32 + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/units/README.md b/modelopt_recipes/configs/ptq/units/README.md index b7a7421f9fc..cd738f62626 100644 --- a/modelopt_recipes/configs/ptq/units/README.md +++ b/modelopt_recipes/configs/ptq/units/README.md @@ -19,7 +19,15 @@ recipes (under `general/` or `models/`) or presets (under `presets/`). | `base_disable_all.yaml` | Deny-all entry: disables all quantizers as the first step | | `default_disabled_quantizers.yaml` | Standard exclusions (LM head, routers, BatchNorm, etc.) 
| | `kv_fp8.yaml` | FP8 E4M3 KV cache quantizer entry; supported on Hopper+ GPUs | +| `kv_fp8_affine.yaml` | FP8 E4M3 affine KV cache quantizer entries; supported on Hopper+ GPUs | | `kv_fp8_cast.yaml` | FP8 E4M3 KV cache with constant amax (skips KV calibration); supported on Hopper+ GPUs | +| `kv_nvfp4.yaml` | NVFP4 KV cache quantizer entry; supported on Blackwell+ GPUs | +| `kv_nvfp4_affine.yaml` | NVFP4 affine KV cache quantizer entries; supported on Blackwell+ GPUs | | `kv_nvfp4_cast.yaml` | NVFP4 KV cache with constant amax (skips KV calibration); supported on Blackwell+ GPUs | +| `kv_nvfp4_rotate.yaml` | NVFP4 rotated KV cache quantizer entries; supported on Blackwell+ GPUs | +| `mamba_moe_disabled_quantizers.yaml` | Shared Mamba-MoE quantizer exclusions | | `w8a8_fp8_fp8.yaml` | FP8 weight + activation quantizer entries (W8A8); supported on Hopper+ GPUs | | `w4a4_nvfp4_nvfp4.yaml` | NVFP4 weight + activation quantizer entries (W4A4); supported on Blackwell+ GPUs | +| `block_sparse_moe_nvfp4.yaml` | NVFP4 W4A4 on `*block_sparse_moe*` weight/input quantizers | +| `experts_nvfp4.yaml` | NVFP4 W4A4 on `*.experts.*` weight/input quantizers | +| `attention_qkv_fp8.yaml` | FP8 E4M3 on attention q/k/v bmm and softmax quantizers | diff --git a/modelopt_recipes/configs/ptq/units/attention_qkv_fp8.yaml b/modelopt_recipes/configs/ptq/units/attention_qkv_fp8.yaml new file mode 100644 index 00000000000..4aa1a7d3240 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/attention_qkv_fp8.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables per-tensor FP8 E4M3 on attention q/k/v +# bmm and softmax quantizers. Pair with a model preset to add bmm2-output entries. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + fp8: configs/numerics/fp8 +--- + - quantizer_name: '*[qkv]_bmm_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/units/base_disable_all.yaml b/modelopt_recipes/configs/ptq/units/base_disable_all.yaml index 9a520ee207f..ee96d00411c 100644 --- a/modelopt_recipes/configs/ptq/units/base_disable_all.yaml +++ b/modelopt_recipes/configs/ptq/units/base_disable_all.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Disable all quantizers by default (deny-all-then-configure pattern). +# QuantizerCfgEntry snippet that disables every quantizer before selective re-enabling. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgEntry quantizer_name: '*' diff --git a/modelopt_recipes/configs/ptq/units/block_sparse_moe_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/block_sparse_moe_nvfp4.yaml new file mode 100644 index 00000000000..b39bc50d748 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/block_sparse_moe_nvfp4.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables dynamic NVFP4 on weight and input +# quantizers under ``*block_sparse_moe*`` paths. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*input_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml index 1508f942776..86d5a64c673 100644 --- a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml +++ b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Standard quantizer exclusions: layers that should not be quantized. +# QuantizerCfgList snippet for standard module patterns that should remain unquantized. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig - quantizer_name: '*block_sparse_moe.gate*' diff --git a/modelopt_recipes/configs/ptq/units/experts_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/experts_nvfp4.yaml new file mode 100644 index 00000000000..31c54e57d9c --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/experts_nvfp4.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables dynamic NVFP4 on weight and input +# quantizers under ``*.experts.*`` paths. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*.experts.*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*.experts.*input_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8.yaml index 646be96709f..86156e5e95c 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization. +# QuantizerCfgList snippet that enables FP8 E4M3 KV-cache quantizers. 
# # This snippet uses multi-document YAML (separated by ---) because it is a # list-valued snippet that also needs to $import another snippet. YAML only diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml new file mode 100644 index 00000000000..5276aff2d48 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables affine FP8 E4M3 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + kv_fp8: configs/ptq/units/kv_fp8 + fp8: configs/numerics/fp8 +--- + - $import: kv_fp8 + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: fp8 + bias: + -2: + -4: + type: static diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml index 64cfbd47bc7..606c969ab37 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization with constant amax. 
+# QuantizerCfgList snippet that enables FP8 E4M3 KV-cache quantizers with constant amax. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml new file mode 100644 index 00000000000..a95b854a0aa --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables NVFP4 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml new file mode 100644 index 00000000000..2122e8b3431 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables affine NVFP4 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + kv_nvfp4: configs/ptq/units/kv_nvfp4 + nvfp4: configs/numerics/nvfp4 +--- + - $import: kv_nvfp4 + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: nvfp4 + bias: + -2: + -4: + type: static diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml index 3fc5d597aa8..b5658c2ff11 100644 --- a/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache quantization with constant amax. +# QuantizerCfgList snippet that enables NVFP4 KV-cache quantizers with constant amax. # # The deployment kernel upcasts NVFP4 KV values to FP8 before attention, so the # scale must land in the FP8 range. diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml new file mode 100644 index 00000000000..b117edbf1be --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet that enables rotated NVFP4 KV-cache quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*q_bmm_quantizer' + cfg: + rotate: true + enable: false + - quantizer_name: '*k_bmm_quantizer' + cfg: + $import: nvfp4 + rotate: true + - quantizer_name: '*v_bmm_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml new file mode 100644 index 00000000000..c9b87f8d212 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# QuantizerCfgList snippet with Mamba/MoE-specific exclusion patterns. 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig + - quantizer_name: '*fc1_latent_proj*' + enable: false + - quantizer_name: '*fc2_latent_proj*' + enable: false + - quantizer_name: '*q_proj*' + enable: false + - quantizer_name: '*k_proj*' + enable: false + - quantizer_name: '*v_proj*' + enable: false + - quantizer_name: '*o_proj*' + enable: false + - quantizer_name: '*self_attention.linear_qkv*' + enable: false + - quantizer_name: '*self_attention.linear_proj*' + enable: false diff --git a/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml index 033cdf76697..010d81ab621 100644 --- a/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# W4A4 NVFP4: NVFP4 E2M1 dynamic weight and activation quantizers. +# QuantizerCfgList snippet that enables dynamic NVFP4 on weight and input quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml b/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml index 07db59ff3b0..068f38d1497 100644 --- a/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml +++ b/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# W8A8 FP8: FP8 E4M3 weight and activation quantizers. +# QuantizerCfgList snippet that enables per-tensor FP8 E4M3 on weight and input quantizers. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml index 4c6ba99e11f..ea2ac567290 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for W8A8 FP8 E4M3 model quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,8 @@ imports: metadata: recipe_type: ptq - description: FP8 per-tensor weight and activation (W8A8), FP8 KV cache, max calibration. + description: >- + Composes W8A8 FP8 E4M3 model quantization with FP8 KV-cache quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml index f99a716ced5..4e24bf53274 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for W8A8 FP8 E4M3 model quantization with FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,8 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - FP8 per-tensor weight and activation (W8A8), FP8 KV cache with constant amax - (skips KV calibration; amax hardcoded to FP8 E4M3 max 448.0), max calibration. 
+ Composes W8A8 FP8 E4M3 model quantization with FP8 KV-cache cast mode using constant amax; uses + max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml index 63b6d673b94..6a65efef57a 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 W4A4, FP8 KV cache, max calibration. + description: >- + Composes dynamic NVFP4 W4A4 model quantization with FP8 KV-cache quantization; uses max + calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml index 1504f33d3cc..312cdd16c8d 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,8 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - NVFP4 W4A4, FP8 KV cache with constant amax (skips KV calibration; amax - hardcoded to FP8 E4M3 max 448.0), max calibration. 
+ Composes dynamic NVFP4 W4A4 model quantization with FP8 KV-cache cast mode using constant amax; + uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml index 6aabb04a150..6dee51857c8 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for NVFP4 W4A4 model quantization with KV quantizers disabled and GPTQ calibration. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 weight and activation (W4A4), gptq layerwise calibration. + description: >- + Applies NVFP4 W4A4 with static weight scales, dynamic inputs, KV quantizers disabled, and GPTQ + layerwise calibration. quantize: algorithm: method: gptq diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml index d9991e0b9c3..0acdf6050db 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with NVFP4 KV-cache cast mode. 
+ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,10 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax - hardcoded to FP8 E4M3 max 448.0 — the deployment kernel upcasts NVFP4 KV - values to FP8 before attention, so the scale must land in the FP8 range), - max calibration. + Composes dynamic NVFP4 W4A4 model quantization with NVFP4 KV-cache cast mode using constant + amax; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml index 547cf312863..08864c8a50d 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for expert-only dynamic NVFP4 quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max calibration. + description: >- + Applies dynamic NVFP4 only to expert-layer weight and input quantizers, plus FP8 KV-cache + quantization; uses max calibration. 
quantize: algorithm: method: max diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml index 5db1666402d..5bf9a36dc31 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for expert-only NVFP4 quantization with MSE weight calibration and FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,7 +24,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax. + description: >- + Applies static NVFP4 weight scales from MSE FP8-scale sweep and dynamic NVFP4 inputs to expert + layers only, plus FP8 KV-cache cast mode. quantize: algorithm: method: mse diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml index 60cba464e0c..a4cf71a1dbd 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for MLP/MoE-only dynamic NVFP4 quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for all linear layers (W4A4), FP8 KV cache, max calibration. 
+ description: >- + Applies dynamic NVFP4 only to MLP/MoE weight and input quantizers, plus FP8 KV-cache + quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml index 875fb47c9b3..2ea2c0ab13e 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for MLP/MoE-only NVFP4 quantization with MSE weight calibration and FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,7 +24,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax. + description: >- + Applies static NVFP4 weight scales from MSE FP8-scale sweep and dynamic NVFP4 inputs to MLP/MoE + layers, plus FP8 KV-cache cast mode. quantize: algorithm: method: mse diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml index 13c7cac0797..5348e8c7123 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for output-projection and MLP/MoE dynamic NVFP4 quantization with FP8 KV-cache quantization. 
+ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for all linear layers including output projections, FP8 KV cache, max calibration. + description: >- + Applies dynamic NVFP4 to output-projection and MLP/MoE weight and input quantizers, plus + FP8 KV-cache quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/speculative_decoding/dflash.yaml b/modelopt_recipes/general/speculative_decoding/dflash.yaml index a38b24d05d6..021cccd475d 100644 --- a/modelopt_recipes/general/speculative_decoding/dflash.yaml +++ b/modelopt_recipes/general/speculative_decoding/dflash.yaml @@ -1,5 +1,4 @@ -# Base config for DFlash training. A full modelopt recipe; override fields via -# OmegaConf dotlist on the CLI (e.g. `model.model_name_or_path=...`). +# DFlash speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI. metadata: recipe_type: speculative_dflash diff --git a/modelopt_recipes/general/speculative_decoding/eagle3.yaml b/modelopt_recipes/general/speculative_decoding/eagle3.yaml index 78767ad1ebb..34448182ae2 100644 --- a/modelopt_recipes/general/speculative_decoding/eagle3.yaml +++ b/modelopt_recipes/general/speculative_decoding/eagle3.yaml @@ -1,5 +1,4 @@ -# Base config for EAGLE3 training. A full modelopt recipe; override fields via -# OmegaConf dotlist on the CLI (e.g. `model.model_name_or_path=...`). +# EAGLE3 speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI. 
metadata: recipe_type: speculative_eagle diff --git a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml index c00aff7d44f..d0adbe00479 100644 --- a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml +++ b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml @@ -13,50 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Model-specific PTQ recipe for Step3.5-Flash NVFP4 MLP/MoE quantization with FP8 KV cache. + +imports: + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for MoE/MLP projections (W4A4), FP8 KV cache, max calibration. + description: >- + Step3.5-Flash PTQ recipe that enables dynamic NVFP4 on MoE/MLP weight and input quantizers, + enables FP8 KV-cache quantizers, and leaves other quantizers disabled. quantize: algorithm: max quant_cfg: - quantizer_name: '*' enable: false - quantizer_name: '*moe*weight_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*moe*input_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*mlp*weight_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*mlp*input_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*[kv]_bmm_quantizer' - enable: true cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: '*share_expert*' enable: false - quantizer_name: '*moe.gate.*' diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index ce241150a3b..4c4e2d07ded 100644 --- a/tests/unit/recipe/test_loader.py +++ 
b/tests/unit/recipe/test_loader.py @@ -16,6 +16,7 @@ """Unit tests for modelopt.recipe.loader and modelopt.recipe.loader.load_config.""" import re +from importlib.resources import files import pytest @@ -85,6 +86,13 @@ def _write_quantizer_cfg_list(path, body: str): path.write_text(QUANTIZER_CFG_LIST_SCHEMA + body) +def _cfg_to_dict(cfg): + """Dump a QuantizerAttributeConfig (or list of them) to plain dicts for comparison.""" + if isinstance(cfg, list): + return [item.model_dump(exclude_unset=True) for item in cfg] + return cfg.model_dump(exclude_unset=True) + + # --------------------------------------------------------------------------- # Directory-format YAML fixtures # --------------------------------------------------------------------------- @@ -1336,20 +1344,20 @@ def test_import_cross_file_same_name_no_conflict(tmp_path): # --------------------------------------------------------------------------- -_BUILTIN_CONFIG_SNIPPETS = [ - "configs/numerics/fp8", - "configs/numerics/nvfp4", - "configs/numerics/nvfp4_static", - "configs/ptq/units/base_disable_all", - "configs/ptq/units/default_disabled_quantizers", - "configs/ptq/units/kv_fp8", - "configs/ptq/units/kv_fp8_cast", - "configs/ptq/units/kv_nvfp4_cast", - "configs/ptq/units/w4a4_nvfp4_nvfp4", - "configs/ptq/units/w8a8_fp8_fp8", - "configs/ptq/presets/kv/fp8", - "configs/ptq/presets/model/fp8", -] +def _iter_builtin_config_snippets(root): + """Yield built-in config YAML files that declare a modelopt schema.""" + for child in sorted(root.iterdir(), key=lambda path: path.name): + if child.is_dir(): + yield from _iter_builtin_config_snippets(child) + elif child.name.endswith((".yaml", ".yml")) and "modelopt-schema:" in child.read_text( + encoding="utf-8" + ): + yield child + + +_BUILTIN_CONFIG_SNIPPETS = list( + _iter_builtin_config_snippets(files("modelopt_recipes").joinpath("configs")) +) @pytest.mark.parametrize("config_path", _BUILTIN_CONFIG_SNIPPETS) @@ -1428,6 +1436,66 @@ def 
test_modelopt_schema_comment_validates_after_import_resolution(tmp_path): } +def test_import_dict_snippet_imports_in_union_typed_list_field(tmp_path): + """A bare import can append into QuantizerCfgEntry.cfg's list branch.""" + (tmp_path / "int4.yaml").write_text( + "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" + "num_bits: 4\n" + "block_sizes:\n" + " -1: 128\n" + " type: static\n" + ) + (tmp_path / "fp8.yaml").write_text( + "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" + "num_bits: e4m3\n" + ) + config_file = tmp_path / "config.yaml" + config_file.write_text( + f"# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig\n" + f"imports:\n" + f" int4: {tmp_path / 'int4.yaml'}\n" + f" fp8: {tmp_path / 'fp8.yaml'}\n" + f"algorithm: awq_lite\n" + f"quant_cfg:\n" + f" - quantizer_name: '*weight_quantizer'\n" + f" cfg:\n" + f" - $import: int4\n" + f" - $import: fp8\n" + ) + + data = load_config(config_file) + + assert _cfg_to_dict(data["quant_cfg"][0]["cfg"]) == [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3)}, + ] + + +def test_import_dict_snippet_in_union_typed_list_field_with_inline_item(tmp_path): + """A dict snippet can be imported as one item inside QuantizerCfgEntry.cfg list.""" + _write_quantizer_attribute( + tmp_path / "int4.yaml", + "num_bits: 4\nblock_sizes:\n -1: 128\n type: static\n", + ) + config_file = tmp_path / "config.yaml" + config_file.write_text( + f"# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig\n" + f"imports:\n" + f" int4: {tmp_path / 'int4.yaml'}\n" + f"algorithm: awq_lite\n" + f"quant_cfg:\n" + f" - quantizer_name: '*weight_quantizer'\n" + f" cfg:\n" + f" - $import: int4\n" + f" - num_bits: e4m3\n" + ) + data = load_config(config_file) + assert _cfg_to_dict(data["quant_cfg"][0]["cfg"]) == [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3)}, + ] + + # 
--------------------------------------------------------------------------- # Coverage: _load_raw_config edge cases # ---------------------------------------------------------------------------