diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4b0bb02682e..b97284f2831 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -48,6 +48,8 @@ Changelog ``ModelOptArgParser`` adds ``--config`` YAML support with CLI overrides and auto-generates ``ARGUMENTS.md`` from dataclass definitions. Dataset blending (``configs/dataset/blend.yaml``) supports HuggingFace datasets, local JSON/JSONL/Parquet files, and weighted multi-source blends. The legacy FSDP1 accelerate config is removed; ``llm_qat`` now documents FSDP2, DeepSpeed, and DDP backends. +- The PTQ example scripts ``examples/llm_ptq/hf_ptq.py``, ``examples/llm_ptq/multinode_ptq.py`` and ``examples/megatron_bridge/quantize.py`` now derive their ``--qformat`` / ``--kv_cache_qformat`` (``--quant_cfg`` / ``--kv_cache_quant`` for Megatron-Bridge) CLI vocabularies by discovering the YAML presets under ``modelopt_recipes/configs/ptq/presets/{model,kv}/`` rather than carrying hardcoded ``QUANT_CFG_CHOICES`` / ``KV_QUANT_CFG_CHOICES`` tables. The discovery helper, alias table and ready-built ``QUANT_CFG_CHOICES`` / ``KV_QUANT_CFG_CHOICES`` mappings now live in ``modelopt.recipe.presets`` and are shared by all three scripts. Presets are loaded eagerly into a plain dict at import. Adding a new preset YAML makes it available on the CLI of all three with no script change — note this means each script now accepts every preset under those directories, not just a previously curated subset. All previously-supported short names (``int8_sq``, ``nvfp4_awq``, ``fp8_pb_wo``, ``nvfp4_mse``, ``w4a8_awq``, ``nvfp4_local_hessian``, ``fp8_pc_pt``, ``int8_wo``) keep working via a small deprecation alias table; new formats should be exposed as preset YAMLs (or, longer term, as full ``--recipe`` recipes). +- Add ``configs/ptq/presets/kv/fp8_cast.yaml`` and ``configs/ptq/presets/kv/nvfp4_cast.yaml``, promoting ``fp8_cast`` / ``nvfp4_cast`` to first-class KV presets composed from the existing ``kv_fp8_cast`` / ``kv_nvfp4_cast`` unit fragments. 
The previous runtime ``use_constant_amax`` post-edit in ``hf_ptq.py`` is removed; ``use_constant_amax: true`` now lives in the YAML and is therefore authoritative. **Note:** custom (out-of-tree) recipes that target a cast KV format must set ``use_constant_amax: true`` themselves on the ``*[kv]_bmm_quantizer`` config — in-tree recipes already do via the ``kv_*_cast`` units. **Bug Fixes** diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 36059c09f2c..3fe3f3ceb03 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -56,6 +56,12 @@ import modelopt.torch.quantization as mtq import modelopt.torch.sparsity as mts from modelopt.recipe import ModelOptPTQRecipe, load_recipe +from modelopt.recipe.presets import ( + KV_CACHE_NONE, + KV_QUANT_CFG_CHOICES, + QFORMAT_ALIASES, + QUANT_CFG_CHOICES, +) from modelopt.torch.export import ( export_hf_checkpoint, export_hf_vllm_fq_checkpoint, @@ -86,56 +92,67 @@ RAND_SEED = 1234 -def _set_kv_cache_constant_amax(quant_cfg: list) -> None: - """Set use_constant_amax on KV cache quantizers. +def _kv_cfg_uses_constant_amax(kv_quant_cfg: list[dict[str, Any]]) -> bool: + """Return True if this KV cfg pins ``use_constant_amax`` on the bmm quantizer. - Creates a new dict for the KV bmm quantizer config to avoid mutating shared references. + Cast-style KV presets (e.g. ``fp8_cast`` / ``nvfp4_cast``) set + ``use_constant_amax: true`` on the ``*[kv]_bmm_quantizer`` entry; that flag + means there is no data-driven calibration to run, so callers should skip + the KV-only calibration pass. Detect the property from the YAML contents + rather than from the preset name so new cast-style presets work + automatically.
""" - for i, entry in enumerate(quant_cfg): + for entry in kv_quant_cfg: if entry.get("quantizer_name") != "*[kv]_bmm_quantizer": continue cfg = entry.get("cfg") or {} - assert isinstance(cfg, dict) - quant_cfg[i] = {**entry, "cfg": {**cfg, "use_constant_amax": True}} - break - - -QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = { - "int8": mtq.INT8_DEFAULT_CFG, - "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, - "int8_wo": mtq.INT8_WEIGHT_ONLY_CFG, - "fp8": mtq.FP8_DEFAULT_CFG, - "int4_awq": mtq.INT4_AWQ_CFG, - "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, - "nvfp4": mtq.NVFP4_DEFAULT_CFG, - "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG, - "nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG, - "fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, - "fp8_pc_pt": mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG, - "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG, - "w4a16_nvfp4": mtq.W4A16_NVFP4_CFG, - "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG, - "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, - "nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG, - "nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG, - "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, - "mxfp8": mtq.MXFP8_DEFAULT_CFG, - "nvfp4_local_hessian": mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG, -} - -KV_QUANT_CFG_CHOICES = { - "none": "none", - "fp8_cast": "FP8_KV_CFG", - "fp8": "FP8_KV_CFG", - "fp8_affine": "FP8_AFFINE_KV_CFG", - "nvfp4_cast": "NVFP4_KV_CFG", - "nvfp4": "NVFP4_KV_CFG", - "nvfp4_affine": "NVFP4_AFFINE_KV_CFG", - "nvfp4_rotate": "NVFP4_KV_ROTATE_CFG", -} - -# Formats that use use_constant_amax (no calibration needed). -_KV_CAST_FORMATS = {"fp8_cast", "nvfp4_cast"} + return bool(cfg.get("use_constant_amax")) + return False + + +# Formats supported by mtq.auto_quantize unified-checkpoint export. +# +# This stays hardcoded — and intentionally not derived from the preset directory — +# because auto_quantize compatibility is a property of the export path (the unified +# HF checkpoint writer, TRT-LLM consumer constraints, layer-wise mixing rules), not +# of the YAML itself. 
A preset can exist and be valid for plain PTQ while not being +# safe to mix into an auto_quantize search. Update this set when adding/removing a +# format from auto_quantize support. +# +# NOTE: auto_quantize is being refactored/reimplemented; this table and the +# _canonical_qformat helper below are expected to be removed in the near future, so +# deliberately not invested in deriving them from the presets. +_AUTO_QUANTIZE_QFORMATS: frozenset[str] = frozenset( + { + "fp8", + "int8_smoothquant", + "int8_weight_only", + "int4_awq", + "nvfp4", + "nvfp4_awq_lite", + "nvfp4_w4a4_weight_mse_fp8_sweep", + "w4a8_awq_beta", + "fp8_2d_blockwise_weight_only", + "w4a8_mxfp4_fp8", + "nvfp4_mlp_only", + "nvfp4_experts_only", + "nvfp4_omlp_only", + "nvfp4_w4a4_weight_local_hessian", + "mxfp8", + } +) + + +def _canonical_qformat(name: str) -> str: + """Resolve a user-provided qformat token to its canonical preset basename. + + Lets membership checks (e.g. against :data:`_AUTO_QUANTIZE_QFORMATS`) accept + either the short alias (``int8_sq``) or the canonical YAML basename + (``int8_smoothquant``). Unknown tokens pass through unchanged so the existing + error paths still fire. + """ + return QFORMAT_ALIASES.get(name, name) + mto.enable_huggingface_checkpointing() @@ -311,27 +328,11 @@ def auto_quantize( qformat_list = args.qformat.split(",") assert qformat_list, "No quantization formats provided" - # Check if all provided quantization formats are supported + # Check if all provided quantization formats are supported. Canonicalize first so + # callers may pass either the short alias (``int8_sq``) or the canonical YAML + # basename (``int8_smoothquant``). 
assert all( - qformat - in [ - "fp8", - "int8_sq", - "int8_wo", - "int4_awq", - "nvfp4", - "nvfp4_awq", - "nvfp4_mse", - "w4a8_awq", - "fp8_pb_wo", - "w4a8_mxfp4_fp8", - "nvfp4_mlp_only", - "nvfp4_experts_only", - "nvfp4_omlp_only", - "nvfp4_local_hessian", - "mxfp8", - ] - for qformat in qformat_list + _canonical_qformat(qformat) in _AUTO_QUANTIZE_QFORMATS for qformat in qformat_list ), "One or more quantization formats provided are not supported for unified checkpoint export" # When language_model is a base text model without lm_head (e.g. Gemma4TextModel), @@ -417,21 +418,16 @@ def forward_step(model, batch): calibrate_loop = create_forward_loop(dataloader=calib_dataloader) # We need to explicitly set up KV cache quantization after auto_quantize - enable_quant_kv_cache = args.kv_cache_qformat != "none" + enable_quant_kv_cache = args.kv_cache_qformat != KV_CACHE_NONE print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") if enable_quant_kv_cache: - kv_cache_quant_cfg = copy.deepcopy( - getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"] - ) + kv_cache_quant_cfg = copy.deepcopy(KV_QUANT_CFG_CHOICES[args.kv_cache_qformat]["quant_cfg"]) kv_cache_quant_cfg = [ e for e in kv_cache_quant_cfg if e["quantizer_name"] != "*" ] # keep other quantizers from auto_quantize - if args.kv_cache_qformat in _KV_CAST_FORMATS: - _set_kv_cache_constant_amax(kv_cache_quant_cfg) - mtq.set_quantizer_by_cfg(language_model, quant_cfg=kv_cache_quant_cfg) - if args.kv_cache_qformat not in _KV_CAST_FORMATS: + if not _kv_cfg_uses_constant_amax(kv_cache_quant_cfg): # Calibrate only the KV cache quantizers; disable all others. with mtq.set_quantizer_by_cfg_context( language_model, @@ -455,21 +451,14 @@ def load_model(args: argparse.Namespace): ) else: assert args.qformat in QUANT_CFG_CHOICES, ( - f"Quantization format is not supported for low memory mode. 
Supported formats: {QUANT_CFG_CHOICES.keys()}" + f"Quantization format is not supported for low memory mode. Supported formats: {list(QUANT_CFG_CHOICES)}" ) quant_cfg = QUANT_CFG_CHOICES[args.qformat] - if args.kv_cache_qformat != "none": + if args.kv_cache_qformat != KV_CACHE_NONE: quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant( quant_cfg, - getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"], + KV_QUANT_CFG_CHOICES[args.kv_cache_qformat]["quant_cfg"], ) - # Mirror the use_constant_amax logic from quantize_main so that init_quantized_weights - # builds the KV quantizers with use_constant_amax already set. In calibration_only mode - # mtq.calibrate() does not re-apply quant_cfg, so this must happen before - # init_quantized_weights runs. - if args.kv_cache_qformat in _KV_CAST_FORMATS: - quant_cfg = copy.deepcopy(quant_cfg) - _set_kv_cache_constant_amax(quant_cfg["quant_cfg"]) # Do not use real quant GEMM so the calibration can be more accurate. with init_quantized_weights( @@ -1103,7 +1092,7 @@ def _is_layerwise(obj): ) assert args.qformat in QUANT_CFG_CHOICES, ( - f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES.keys())}" + f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES)}" ) quant_cfg = QUANT_CFG_CHOICES[args.qformat] @@ -1113,14 +1102,14 @@ def _is_layerwise(obj): args.moe_calib_experts_ratio, ) - enable_quant_kv_cache = args.kv_cache_qformat != "none" + enable_quant_kv_cache = args.kv_cache_qformat != KV_CACHE_NONE print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") # Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer. 
if enable_quant_kv_cache: quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant( quant_cfg, - getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"], + KV_QUANT_CFG_CHOICES[args.kv_cache_qformat]["quant_cfg"], ) # Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92). @@ -1135,14 +1124,6 @@ def _is_layerwise(obj): quant_cfg["quant_cfg"].append({"quantizer_name": pattern, "enable": False}) print(f"Excluding MTP layer from quantization: {pattern}") - # Use constant amax for KV quantizers when a cast format is selected. - # Recipes are authoritative for KV cache config (including use_constant_amax), - # so skip this post-hoc override when --recipe is used; rely on the YAML instead - # (see modelopt_recipes/general/ptq/*_cast_kv.yaml). - if args.recipe is None and args.kv_cache_qformat in _KV_CAST_FORMATS: - quant_cfg = copy.deepcopy(quant_cfg) - _set_kv_cache_constant_amax(quant_cfg["quant_cfg"]) - if needs_checkpoint_path_update(quant_cfg): quant_cfg = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path) print( @@ -1293,12 +1274,12 @@ def parse_args() -> argparse.Namespace: "--kv_cache_qformat", required=False, default="fp8_cast", - choices=KV_QUANT_CFG_CHOICES.keys(), + choices=[KV_CACHE_NONE, *KV_QUANT_CFG_CHOICES], help=( "Specify KV cache quantization format. Default: fp8_cast. " - "Formats ending in '_cast' (fp8_cast, nvfp4_cast) set the amax to FP8 range " - "without data-driven calibration. " - "Other formats (fp8, nvfp4, etc.) use data-driven calibration. " + "Formats whose preset pins use_constant_amax on the KV bmm quantizer " + "(e.g. fp8_cast, nvfp4_cast) set the amax to FP8 range without data-driven " + "calibration; all other formats (fp8, nvfp4, ...) use data-driven calibration. " "Ignored when --recipe is given: the recipe YAML is authoritative for KV " "cache config (use the *_cast_kv.yaml recipes for the cast variants)." 
), @@ -1475,6 +1456,16 @@ def parse_args() -> argparse.Namespace: if args.specdec_offline_dataset is not None and args.low_memory_mode: parser.error("--specdec_offline_dataset is not compatible with --low_memory_mode.") + # The low-memory loader pre-instruments quantizers from --qformat/--kv_cache_qformat + # via init_quantized_weights(), so it cannot honor a --recipe (which is authoritative + # for the quant layout in quantize_main). Reject the combination rather than silently + # instrumenting a layout that diverges from the recipe. + if args.low_memory_mode and args.recipe is not None: + parser.error( + "--low_memory_mode does not yet support --recipe; the low-memory loader still " + "initializes quantizers from --qformat/--kv_cache_qformat." + ) + return args diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py index 07cad24ee5e..12e6c04e535 100644 --- a/examples/llm_ptq/multinode_ptq.py +++ b/examples/llm_ptq/multinode_ptq.py @@ -22,7 +22,6 @@ import time import warnings from pathlib import Path -from typing import Any import numpy as np import torch @@ -34,6 +33,7 @@ import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq +from modelopt.recipe.presets import KV_CACHE_NONE, KV_QUANT_CFG_CHOICES, QUANT_CFG_CHOICES from modelopt.torch.export import get_model_type from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format from modelopt.torch.export.unified_export_hf import _export_transformers_checkpoint @@ -44,25 +44,6 @@ # Constants RAND_SEED = 1234 -QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = { - "int8": mtq.INT8_DEFAULT_CFG, - "int4_awq": mtq.INT4_AWQ_CFG, - "fp8": mtq.FP8_DEFAULT_CFG, - "nvfp4": mtq.NVFP4_DEFAULT_CFG, - "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG, - "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG, - "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, - "nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG, - "nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG, -} - -KV_QUANT_CFG_CHOICES = { - "none": 
"none", - "fp8": "FP8_KV_CFG", - "nvfp4": "NVFP4_KV_CFG", - "nvfp4_affine": "NVFP4_AFFINE_KV_CFG", -} - # Enable HuggingFace checkpointing mto.enable_huggingface_checkpointing() @@ -80,13 +61,13 @@ def parse_args(): parser.add_argument( "--qformat", default="fp8", - choices=QUANT_CFG_CHOICES.keys(), + choices=list(QUANT_CFG_CHOICES), help="Quantization format", ) parser.add_argument( "--kv_cache_qformat", default="fp8", - choices=list(KV_QUANT_CFG_CHOICES.keys()), + choices=[KV_CACHE_NONE, *KV_QUANT_CFG_CHOICES], help="KV cache quantization format", ) parser.add_argument( @@ -280,7 +261,7 @@ def main(args): # Validate quantization format if args.qformat not in QUANT_CFG_CHOICES: raise ValueError( - f"Quantization format {args.qformat} not supported. Choose from: {QUANT_CFG_CHOICES.keys()}" + f"Quantization format {args.qformat} not supported. Choose from: {list(QUANT_CFG_CHOICES)}" ) # Set random seeds @@ -334,14 +315,14 @@ def main(args): args.awq_block_size, ) - enable_quant_kv_cache = args.kv_cache_qformat != "none" + enable_quant_kv_cache = args.kv_cache_qformat != KV_CACHE_NONE print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") # Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer. 
if enable_quant_kv_cache: quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant( quant_cfg, - getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"], + KV_QUANT_CFG_CHOICES[args.kv_cache_qformat]["quant_cfg"], ) # Quantize the model diff --git a/examples/megatron_bridge/quantize.py b/examples/megatron_bridge/quantize.py index fff16518d5f..078639f3699 100644 --- a/examples/megatron_bridge/quantize.py +++ b/examples/megatron_bridge/quantize.py @@ -60,31 +60,16 @@ import modelopt.torch.quantization as mtq import modelopt.torch.utils.distributed as dist from modelopt.recipe import ModelOptPTQRecipe, load_recipe +from modelopt.recipe.presets import KV_CACHE_NONE, KV_QUANT_CFG_CHOICES, QUANT_CFG_CHOICES from modelopt.torch.utils import print_args, print_rank_0, warn_rank_0 from modelopt.torch.utils.plugins.mbridge import load_mbridge_model_from_hf from modelopt.torch.utils.plugins.megatron_calibration import get_megatron_calibration_forward_loop from modelopt.torch.utils.plugins.megatron_generate import megatron_generate -# Curated short-name aliases for the most common quantization configs. Any other config exposed by -# ``mtq.config.choices`` (e.g. ``FP8_DEFAULT_CFG``) can also be passed by its full name. -QUANT_CFG_CHOICES = { - "int8": mtq.INT8_DEFAULT_CFG, - "int8_sq": mtq.INT8_SMOOTHQUANT_CFG, - "fp8": mtq.FP8_DEFAULT_CFG, - "fp8_blockwise": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, - "int4_awq": mtq.INT4_AWQ_CFG, - "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, - "nvfp4": mtq.NVFP4_DEFAULT_CFG, - "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG, -} - -# KV-cache quantization configs (applied on top of the weight/activation quant config). -KV_QUANT_CFG_CHOICES = { - "none": "none", - "fp8": "FP8_KV_CFG", - "nvfp4": "NVFP4_KV_CFG", - "nvfp4_affine": "NVFP4_AFFINE_KV_CFG", -} +# The --quant_cfg / --kv_cache_quant CLI vocabularies are discovered from the preset +# YAMLs (shared with the llm_ptq examples via modelopt.recipe.presets). 
--quant_cfg +# additionally accepts any full config name from ``mtq.config.choices`` (e.g. +# ``FP8_DEFAULT_CFG``); see get_quant_config below. # TODO: Add AutoQuantize (mtq.auto_quantize) support to automatically search a per-layer mix of # quantization formats that meets a target compression / accuracy constraint, instead of applying a @@ -123,7 +108,7 @@ def get_args() -> argparse.Namespace: type=str, default="fp8", help=( - f"Quantization config. Short aliases: {', '.join(QUANT_CFG_CHOICES)}. " + f"Quantization config. Preset names / short aliases: {', '.join(QUANT_CFG_CHOICES)}. " "You can also pass any full config name exposed by modelopt (e.g. FP8_DEFAULT_CFG). " "Ignored when --recipe is set." ), @@ -131,8 +116,8 @@ def get_args() -> argparse.Namespace: parser.add_argument( "--kv_cache_quant", type=str, - default="none", - choices=list(KV_QUANT_CFG_CHOICES), + default=KV_CACHE_NONE, + choices=[KV_CACHE_NONE, *KV_QUANT_CFG_CHOICES], help="KV-cache quantization config to apply on top of --quant_cfg. Ignored when --recipe is set.", ) parser.add_argument( @@ -205,7 +190,7 @@ def get_quant_config(args: argparse.Namespace) -> dict: # customizations below are skipped. print_rank_0(f"Using recipe {args.recipe} for quantization") if ( - args.kv_cache_quant != "none" + args.kv_cache_quant != KV_CACHE_NONE or args.weight_only or args.moe_calib_experts_ratio is not None ): @@ -226,20 +211,21 @@ def get_quant_config(args: argparse.Namespace) -> dict: mtq_config = getattr(mtq, args.quant_cfg) else: raise ValueError( - f"Unsupported --quant_cfg '{args.quant_cfg}'. Choose one of the short aliases " + f"Unsupported --quant_cfg '{args.quant_cfg}'. Choose a preset name / short alias " f"({', '.join(QUANT_CFG_CHOICES)}) or a full config name from {mtq.config.choices}." ) - # Deepcopy so we don't mutate the shared module-level config, and normalize the inner quant_cfg - # to the list format so we can safely append customizations below. 
+ # Deepcopy so we don't mutate a shared module-level config (both the ``mtq.config.choices`` + # full-name branch and QUANT_CFG_CHOICES hand back shared, eagerly loaded dicts), and + # normalize the inner quant_cfg to the list format so we can safely append customizations below. mtq_config = copy.deepcopy(mtq_config) mtq_config["quant_cfg"] = mtq.normalize_quant_cfg_list(mtq_config["quant_cfg"]) if args.weight_only: mtq_config["quant_cfg"].append({"quantizer_name": "*input_quantizer", "enable": False}) - if args.kv_cache_quant != "none": - kv_cache_quant_cfg = getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_quant])["quant_cfg"] + if args.kv_cache_quant != KV_CACHE_NONE: + kv_cache_quant_cfg = KV_QUANT_CFG_CHOICES[args.kv_cache_quant]["quant_cfg"] mtq_config = mtq.utils.update_quant_cfg_with_kv_cache_quant(mtq_config, kv_cache_quant_cfg) # For MoE models, optionally calibrate only a fraction of experts per forward pass for speed. diff --git a/modelopt/recipe/presets.py b/modelopt/recipe/presets.py new file mode 100644 index 00000000000..46b55287074 --- /dev/null +++ b/modelopt/recipe/presets.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PTQ quant-config preset discovery shared by the PTQ example scripts.
+ +The example PTQ entry points (``examples/llm_ptq/hf_ptq.py``, +``examples/llm_ptq/multinode_ptq.py``, ``examples/megatron_bridge/quantize.py``) +expose a ``--qformat`` / ``--kv_cache_qformat`` (``--quant_cfg`` / +``--kv_cache_quant`` for Megatron-Bridge) CLI vocabulary. Rather than hardcoding a +name → config table in each script, the vocabulary is discovered by listing the +YAML presets shipped under ``modelopt_recipes/configs/ptq/presets/{model,kv}/``: +every ``*.yaml`` basename is a valid format name, and the directory listing is the +single source of truth. Adding a preset YAML exposes it on all three CLIs with no +code change. + +:data:`QUANT_CFG_CHOICES` and :data:`KV_QUANT_CFG_CHOICES` are the ready-to-use +mappings; :func:`load_quant_cfg_choices` builds equivalent mappings for custom +preset directories. Configs are loaded eagerly into plain dicts at import; callers +that mutate a returned config must deepcopy it first (this mirrors how the +``mtq.*_CFG`` module constants — themselves eagerly-loaded shared dicts — are used). +""" + +from collections.abc import Mapping +from typing import Any + +from modelopt.torch.opt.config_loader import BUILTIN_CONFIG_ROOT, load_config +from modelopt.torch.quantization.config import QuantizeConfig + +__all__ = [ + "KV_CACHE_NONE", + "KV_QUANT_CFG_CHOICES", + "KV_QUANT_PRESET_DIR", + "MODEL_QUANT_PRESET_DIR", + "QFORMAT_ALIASES", + "QUANT_CFG_CHOICES", + "load_quant_cfg_choices", +] + +# Preset directories (relative to ``modelopt_recipes/``) that back the CLI vocabulary. +# +# Prefer NOT to add new YAMLs to these directories: the long-term direction is to +# retire ``--qformat`` / ``--kv_cache_qformat`` in favour of ``--recipe`` (a full PTQ +# recipe; see ``modelopt_recipes/general/ptq/`` and :mod:`modelopt.recipe`). New +# quantization configurations should be authored as recipes, not as preset entries. 
+MODEL_QUANT_PRESET_DIR = "configs/ptq/presets/model" +KV_QUANT_PRESET_DIR = "configs/ptq/presets/kv" + +# Sentinel ``--kv_cache_qformat`` value meaning "no KV cache quantization". Handled by +# the scripts outside the discovered presets; guarded below against a ``none.yaml`` clash. +KV_CACHE_NONE = "none" + +# Backward-compat short names → canonical preset basename. These aliases predate the +# YAML-driven discovery and remain accepted so existing scripts/docs keep working. +# +# DO NOT add new entries here. New quantization formats must be exposed via their YAML +# basename under ``modelopt_recipes/configs/ptq/presets/model/`` — the directory listing +# is the canonical CLI vocabulary. This table exists solely to keep pre-existing short +# names working through deprecation and should only ever shrink. +QFORMAT_ALIASES: dict[str, str] = { + "int8_sq": "int8_smoothquant", + "int8_wo": "int8_weight_only", + "w4a8_awq": "w4a8_awq_beta", + "nvfp4_awq": "nvfp4_awq_lite", + "nvfp4_mse": "nvfp4_w4a4_weight_mse_fp8_sweep", + "nvfp4_local_hessian": "nvfp4_w4a4_weight_local_hessian", + "fp8_pb_wo": "fp8_2d_blockwise_weight_only", + "fp8_pc_pt": "fp8_per_channel_per_token", +} + + +def load_quant_cfg_choices( + subdir: str, aliases: Mapping[str, str] | None = None +) -> dict[str, dict[str, Any]]: + """Build a ``{qformat_name: quant_cfg_dict}`` mapping from preset YAMLs. + + Every ``*.yaml`` under ``modelopt_recipes/{subdir}/`` is loaded and keyed by its + basename — the directory listing is the CLI vocabulary. ``aliases`` adds extra + short names pointing at canonical basenames; a stale alias raises here (at load + time) rather than failing silently at lookup time. + + Args: + subdir: Preset directory relative to ``modelopt_recipes/`` (e.g. + :data:`MODEL_QUANT_PRESET_DIR`). + aliases: Optional ``short_name -> canonical_basename`` deprecation map. + + Returns: + Mapping from format name (preset basename or alias) to the loaded + ``QuantizeConfig`` dict.
Configs are loaded eagerly; callers that mutate a + returned config must deepcopy it first. + """ + aliases = aliases or {} + basenames = sorted( + entry.name.rsplit(".", 1)[0] + for entry in BUILTIN_CONFIG_ROOT.joinpath(subdir).iterdir() + if entry.name.endswith((".yaml", ".yml")) + ) + choices: dict[str, dict[str, Any]] = { + name: load_config(f"{subdir}/{name}", schema_type=QuantizeConfig).model_dump( + exclude_unset=True + ) + for name in basenames + } + for alias, target in sorted(aliases.items()): + if target not in choices: + raise ValueError( + f"Alias {alias!r} points at preset {target!r} which is not present " + f"under modelopt_recipes/{subdir}/." + ) + choices[alias] = choices[target] + return choices + + +QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = load_quant_cfg_choices( + MODEL_QUANT_PRESET_DIR, QFORMAT_ALIASES +) +KV_QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = load_quant_cfg_choices(KV_QUANT_PRESET_DIR) + +# Guard against a future ``none.yaml`` (or alias) colliding with the disable sentinel: +# the runtime branch on ``!= KV_CACHE_NONE`` would otherwise become ambiguous. +assert KV_CACHE_NONE not in KV_QUANT_CFG_CHOICES, ( + f"KV_CACHE_NONE sentinel {KV_CACHE_NONE!r} collides with a KV preset; rename the preset." +) diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8_cast.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8_cast.yaml new file mode 100644 index 00000000000..e689a17ad4e --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8_cast.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables FP8 E4M3 KV-cache quantizers with +# ``use_constant_amax`` (no data-driven calibration required). + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_fp8_cast: configs/ptq/units/kv_fp8_cast + +quant_cfg: + - $import: kv_fp8_cast diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_cast.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_cast.yaml new file mode 100644 index 00000000000..665e20fe4fa --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_cast.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Partial QuantizeConfig that enables NVFP4 KV-cache quantizers with +# ``use_constant_amax`` (no data-driven calibration required). 
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_nvfp4_cast: configs/ptq/units/kv_nvfp4_cast + +quant_cfg: + - $import: kv_nvfp4_cast diff --git a/tests/unit/recipe/test_presets.py b/tests/unit/recipe/test_presets.py new file mode 100644 index 00000000000..1f2c0d3ea0e --- /dev/null +++ b/tests/unit/recipe/test_presets.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Smoke tests for ``modelopt.recipe.presets`` preset discovery. + +Guards the eager import-time load shared by the PTQ example scripts: every preset +under the model/KV dirs must load into a usable ``quant_cfg`` dict, deprecation +aliases must resolve to their canonical preset, and the KV ``none`` sentinel must +not collide with a discovered preset. A single malformed preset YAML would +otherwise break ``import modelopt.recipe.presets`` (and every PTQ example). 
+""" + +import pytest + +from modelopt.recipe import presets +from modelopt.torch.opt.config_loader import BUILTIN_CONFIG_ROOT + + +def _yaml_basenames(subdir: str) -> set[str]: + return { + entry.name.rsplit(".", 1)[0] + for entry in BUILTIN_CONFIG_ROOT.joinpath(subdir).iterdir() + if entry.name.endswith((".yaml", ".yml")) + } + + +@pytest.mark.parametrize( + ("choices", "preset_dir"), + [ + (presets.QUANT_CFG_CHOICES, presets.MODEL_QUANT_PRESET_DIR), + (presets.KV_QUANT_CFG_CHOICES, presets.KV_QUANT_PRESET_DIR), + ], + ids=["model", "kv"], +) +def test_every_discovered_preset_loads(choices, preset_dir): + # Configs load eagerly at import, so a malformed preset would already have raised. + # Assert discovery is non-empty, covers every YAML on disk, and that each resolved + # entry is a usable quant_cfg dict. + basenames = _yaml_basenames(preset_dir) + assert basenames, f"no preset YAMLs discovered under {preset_dir}" + assert basenames <= set(choices), "a preset YAML is missing from the discovered choices" + for name, cfg in choices.items(): + assert isinstance(cfg, dict), f"{name} did not resolve to a dict" + assert "quant_cfg" in cfg, f"{name} is missing the 'quant_cfg' key" + + +def test_aliases_resolve_to_their_canonical_preset(): + for alias, target in presets.QFORMAT_ALIASES.items(): + assert alias in presets.QUANT_CFG_CHOICES, f"alias {alias!r} not exposed" + assert target in presets.QUANT_CFG_CHOICES, f"alias target {target!r} missing" + assert presets.QUANT_CFG_CHOICES[alias] == presets.QUANT_CFG_CHOICES[target] + + +def test_kv_none_sentinel_is_not_a_discovered_preset(): + # The scripts branch on ``kv_cache_qformat != KV_CACHE_NONE``; a real preset named + # "none" would make that branch ambiguous. 
+ assert presets.KV_CACHE_NONE not in presets.KV_QUANT_CFG_CHOICES + + +def test_load_quant_cfg_choices_rejects_stale_alias(): + with pytest.raises(ValueError, match="does-not-exist"): + presets.load_quant_cfg_choices( + presets.MODEL_QUANT_PRESET_DIR, {"bad_alias": "does-not-exist"} + )