NVIDIA · jenchen13 · May 29, 2026 · May 22, 2026 · May 22, 2026 · May 26, 2026
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -33,10 +33,14 @@ Changelog
 - Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
 - Add ``DATASET_COMBOS`` to ``modelopt.torch.utils.dataset_utils`` — single ``--dataset`` tokens that fan out to multiple registered datasets; per-entry ``num_samples`` is split evenly across the members. Initial combos: ``cnn_nemotron_v2_mix`` (``cnn_dailymail`` + ``nemotron-post-training-dataset-v2``, used by ``hf_ptq.py`` when no ``--dataset`` is provided) and ``nemotron-post-training-v3`` (the seven ``nvidia/Nemotron-*`` SFT datasets added in #1498, mirroring the `nemotron-post-training-v3 collection <https://huggingface.co/collections/nvidia/nemotron-post-training-v3>`_). Combo names are listed by ``get_supported_datasets()`` and surfaced in ``--dataset`` help. ``get_dataset_dataloader`` rejects inputs that mix a combo with one of its member datasets (e.g. ``cnn_dailymail,cnn_nemotron_v2_mix``) to avoid double-sampling, and ``get_dataset_samples`` rejects combo names so callers route through the dataloader. ``hf_ptq.py`` default ``--calib_size`` is bumped from ``512`` to ``1024`` so the total calibration sample count under the new default combo matches the previous two-dataset fallback.
 - The ``nemotron-sft-agentic-v2`` registered dataset (added in #1498) now uses only the ``search`` split. The previously configured ``interactive_agent`` and ``tool_calling`` splits contain content-level defects (heterogeneous schema and a malformed JSON row, respectively) that cause pyarrow's streaming JSON reader to fail deterministically.
+- Support Megatron-Core checkpoint restore and export for MSE ``NVFP4StaticQuantizer``.
+- Add mixed-precision FP8 + NVFP4 export for Megatron-Core: the per-layer ``quant_algo`` is recorded under ``quantized_layers`` in ``hf_quant_config.json``, the ``kv_cache_dtype`` gather is pipeline-parallel (PP) aware, and fused-QKV exclude entries are split into per-HF-name ``q_proj`` / ``k_proj`` / ``v_proj`` entries.
+- Add Nemotron-3-Super-120B-A12B PTQ recipes ``modelopt_recipes/models/Nemotron-3-Super-120B-A12B/super-nvfp4.yaml`` (MSE-mixed) and ``super-nvfp4-max-calib.yaml`` (max-calib mixed): NVFP4 W4A4 routed experts + FP8 per-tensor shared experts / Mamba in/out_proj + FP8 KV cache.
 - Add quantized ``nn.Embedding`` support. ``nn.Embedding`` is now registered in ``QuantModuleRegistry`` and exposes ``weight_quantizer`` (embedding table), ``output_quantizer`` (lookup activations), and a permanently disabled ``input_quantizer`` placeholder — embedding inputs are integer indices and cannot be fake-quantized, so direct ``enable*()`` calls raise. ``export_hf_checkpoint`` packs quantized embedding weights alongside Linear layers. Embedding quantizers are opt-in (``parent_class: nn.Embedding`` disabled by default).
 
 **Bug Fixes**
 
+- In Megatron-Core, sync routed expert weight amax across EP ranks only when ``sync_expert_weight_amax=True``. Previously, the EP amax sync was applied to routed expert weights even when ``sync_expert_weight_amax`` was ``False``.
 - Fix Megatron-Core HF importer to load fused ``TELayerNormColumnParallelLinear.layer_norm_weight`` from HF for GPT-family models (Qwen3 etc.) under ``--export-default-te-spec``. Importer now prefers per-context keys ``fused_input_layernorm`` / ``fused_pre_mlp_layernorm`` (fallback ``fused_norm`` for Nemotron-H backward compatibility); ``mcore_qwen.py`` provides the new rules. Without this fix, post-prune MMLU sat at chance.
 
 0.44 (2026-05-14)

@@ -730,11 +730,7 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data
                 # Strip HF metadata from the schema to avoid Feature parsing errors
                 schema = table.schema
                 if schema.metadata and b"huggingface" in schema.metadata:
-                    new_meta = {
-                        k: v
-                        for k, v in schema.metadata.items()
-                        if k != b"huggingface"
-                    }
+                    new_meta = {k: v for k, v in schema.metadata.items() if k != b"huggingface"}
                     table = table.replace_schema_metadata(new_meta or None)
                 dataset = HFDataset(table)
         if self.num_samples is not None and self.num_samples < len(dataset):

@@ -22,9 +22,21 @@
 
 import torch
 from huggingface_hub import snapshot_download
+from huggingface_hub.errors import LocalEntryNotFoundError
 from safetensors.torch import safe_open
 from tqdm import tqdm
 
+_HF_HUB_OFFLINE_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+
+
+def _is_hf_hub_offline() -> bool:
+    return os.environ.get("HF_HUB_OFFLINE", "").strip().upper() in _HF_HUB_OFFLINE_TRUE_VALUES
+
+
+def _copy_python_files(source_dir: Path, save_dir: Path) -> None:
+    for py_file in source_dir.glob("*.py"):
+        shutil.copy2(py_file, save_dir / py_file.name)
+
 
 def copy_hf_ckpt_remote_code(
     pretrained_model_path: str | os.PathLike, save_directory: str | os.PathLike
@@ -36,7 +48,10 @@ def copy_hf_ckpt_remote_code(
     frameworks.
 
     If ``pretrained_model_path`` is a local directory, Python files are copied directly.
-    If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), files are downloaded from the Hub.
+    If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), the Hub
+    snapshot is resolved first and Python files are copied from that snapshot. When
+    ``HF_HUB_OFFLINE`` is set, the snapshot must already be available in the local
+    Hugging Face cache.
 
     Args:
         pretrained_model_path: Local path to the pretrained model or HuggingFace Hub model ID.
@@ -47,14 +62,28 @@ def copy_hf_ckpt_remote_code(
     save_dir.mkdir(parents=True, exist_ok=True)
 
     if hf_checkpoint_path.is_dir():
-        for py_file in hf_checkpoint_path.glob("*.py"):
-            shutil.copy2(py_file, save_dir / py_file.name)
+        _copy_python_files(hf_checkpoint_path, save_dir)
     else:
-        snapshot_download(
-            repo_id=str(pretrained_model_path),
-            local_dir=str(save_dir),
-            allow_patterns=["*.py"],
-        )
+        local_files_only = _is_hf_hub_offline()
+        try:
+            source_dir = Path(
+                snapshot_download(
+                    repo_id=str(pretrained_model_path),
+                    allow_patterns=["*.py"],
+                    local_files_only=local_files_only,
+                )
+            )
+        except LocalEntryNotFoundError as exc:
+            if local_files_only:
+                raise RuntimeError(
+                    f"Could not copy Python sidecar files for {pretrained_model_path!r} because "
+                    "HF_HUB_OFFLINE is enabled and the files are not available in the local "
+                    "Hugging Face cache. Populate the cache with the model's *.py files or pass "
+                    "a local pretrained model directory."
+                ) from exc
+            raise
+
+        _copy_python_files(source_dir, save_dir)
 
 
 def load_multimodal_components(
@@ -123,3 +152,27 @@ def load_multimodal_components(
 
     print(f"Successfully loaded {len(multimodal_state_dict)} multimodal tensors")
     return multimodal_state_dict
+
+
+def copy_non_safetensor_files_from_ckpt(src: str | os.PathLike, dst: str | os.PathLike):
+    """Copy every non-safetensors file from a local HF checkpoint dir verbatim.
+
+    Use as a baseline so tokenizer files, remote_code ``*.py``, README, LICENSE, etc.
+    are preserved from the source. The caller is expected to overwrite the files
+    modelopt owns (``config.json``, ``generation_config.json``, ``hf_quant_config.json``,
+    ``preprocessor_config.json``) after this step.
+
+    Args:
+        src: Source HF checkpoint directory. Must be a local path.
+        dst: Destination directory; created if missing.
+    """
+    if not os.path.isdir(src):
+        raise ValueError(f"Invalid source path: {src}. It should be a directory.")
+    os.makedirs(dst, exist_ok=True)
+    for entry in os.listdir(src):
+        sp = os.path.join(src, entry)
+        if not os.path.isfile(sp):
+            continue
+        if entry.endswith(".safetensors") or entry == "model.safetensors.index.json":
+            continue
+        shutil.copy2(sp, dst)
@@ -131,7 +131,10 @@
     "input_layernorm": NameRemapping("backbone.layers.{}.norm."),
     "linear_qkv": QKVSlicing("backbone.layers.{}.mixer."),
     "linear_proj": NameRemapping("backbone.layers.{}.mixer.o_proj."),
-    "core_attention": SelfAttentionScaling("backbone.layers.{}.mixer."),
+    "core_attention": SelfAttentionScaling(
+        "backbone.layers.{}.mixer.",
+        func_kwargs={"k_scale_name": "k_proj.k_scale", "v_scale_name": "v_proj.v_scale"},
+    ),
     # MLP
     "pre_mlp_layernorm": NameRemapping("backbone.layers.{}.norm."),
     "linear_fc1": NameRemapping("backbone.layers.{}.mixer.up_proj."),

@@ -288,9 +288,25 @@ def _ensure_weight_quantizer_calibrated(
         module_name: Optional module name for better warning messages
     """
     if isinstance(weight_quantizer, NVFP4StaticQuantizer):
-        need_per_block = not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None
+
+        def _amax_is_invalid(t: torch.Tensor | None) -> bool:
+            # MCore distcp may register but not fill amax — treat missing/non-finite/negative as recompute.
+            if t is None:
+                return True
+            t = t.detach()
+            if not torch.is_floating_point(t):
+                return False
+            return bool((~torch.isfinite(t) | (t < 0)).any().item())
+
+        need_per_block = (
+            not hasattr(weight_quantizer, "_amax")
+            or weight_quantizer._amax is None
+            or _amax_is_invalid(weight_quantizer._amax)
+        )
         need_global = (
-            not hasattr(weight_quantizer, "_global_amax") or weight_quantizer.global_amax is None
+            not hasattr(weight_quantizer, "_global_amax")
+            or weight_quantizer.global_amax is None
+            or _amax_is_invalid(weight_quantizer.global_amax)
         )
         if not (need_per_block or need_global):
             return