-
Notifications
You must be signed in to change notification settings - Fork 429
Support Mixed precision & Static MSE in MCore; Nemotron Super v3 NVFP4 recipe #1521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
676eac4
9143e02
57e5f26
2dea94a
918ed6a
05a436f
ff20eca
985da85
d88e54a
5f291a6
5c9cd43
c5c7a2e
d63bf70
e14fa62
e985e93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,9 +22,21 @@ | |
|
|
||
| import torch | ||
| from huggingface_hub import snapshot_download | ||
| from huggingface_hub.errors import LocalEntryNotFoundError | ||
| from safetensors.torch import safe_open | ||
| from tqdm import tqdm | ||
|
|
||
| _HF_HUB_OFFLINE_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} | ||
|
|
||
|
|
||
def _is_hf_hub_offline() -> bool:
    """Report whether the ``HF_HUB_OFFLINE`` environment variable requests offline mode.

    The variable is read at call time. Surrounding whitespace is ignored and the
    comparison is case-insensitive; an unset variable counts as "not offline".

    Returns:
        ``True`` when the normalized value is one of ``_HF_HUB_OFFLINE_TRUE_VALUES``.
    """
    raw_flag = os.environ.get("HF_HUB_OFFLINE", "")
    normalized_flag = raw_flag.strip().upper()
    return normalized_flag in _HF_HUB_OFFLINE_TRUE_VALUES
|
|
||
|
|
||
| def _copy_python_files(source_dir: Path, save_dir: Path) -> None: | ||
| for py_file in source_dir.glob("*.py"): | ||
| shutil.copy2(py_file, save_dir / py_file.name) | ||
|
Comment on lines
+36
to
+38
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Copy Python sidecars recursively to avoid missing package modules. Line 37 only scans top-level `*.py` files, so Python modules that live in nested package directories are not copied.

💡 Proposed fix

 def _copy_python_files(source_dir: Path, save_dir: Path) -> None:
-    for py_file in source_dir.glob("*.py"):
-        shutil.copy2(py_file, save_dir / py_file.name)
+    for py_file in source_dir.rglob("*.py"):
+        rel = py_file.relative_to(source_dir)
+        target = save_dir / rel
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(py_file, target)

🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
| def copy_hf_ckpt_remote_code( | ||
| pretrained_model_path: str | os.PathLike, save_directory: str | os.PathLike | ||
|
|
@@ -36,7 +48,10 @@ def copy_hf_ckpt_remote_code( | |
| frameworks. | ||
|
|
||
| If ``pretrained_model_path`` is a local directory, Python files are copied directly. | ||
| If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), files are downloaded from the Hub. | ||
| If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), the Hub | ||
| snapshot is resolved first and Python files are copied from that snapshot. When | ||
| ``HF_HUB_OFFLINE`` is set, the snapshot must already be available in the local | ||
| Hugging Face cache. | ||
|
|
||
| Args: | ||
| pretrained_model_path: Local path to the pretrained model or HuggingFace Hub model ID. | ||
|
|
@@ -47,14 +62,28 @@ def copy_hf_ckpt_remote_code( | |
| save_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| if hf_checkpoint_path.is_dir(): | ||
| for py_file in hf_checkpoint_path.glob("*.py"): | ||
| shutil.copy2(py_file, save_dir / py_file.name) | ||
| _copy_python_files(hf_checkpoint_path, save_dir) | ||
| else: | ||
| snapshot_download( | ||
| repo_id=str(pretrained_model_path), | ||
| local_dir=str(save_dir), | ||
| allow_patterns=["*.py"], | ||
| ) | ||
| local_files_only = _is_hf_hub_offline() | ||
| try: | ||
| source_dir = Path( | ||
| snapshot_download( | ||
| repo_id=str(pretrained_model_path), | ||
| allow_patterns=["*.py"], | ||
| local_files_only=local_files_only, | ||
| ) | ||
| ) | ||
| except LocalEntryNotFoundError as exc: | ||
| if local_files_only: | ||
| raise RuntimeError( | ||
| f"Could not copy Python sidecar files for {pretrained_model_path!r} because " | ||
| "HF_HUB_OFFLINE is enabled and the files are not available in the local " | ||
| "Hugging Face cache. Populate the cache with the model's *.py files or pass " | ||
| "a local pretrained model directory." | ||
| ) from exc | ||
| raise | ||
|
|
||
| _copy_python_files(source_dir, save_dir) | ||
|
|
||
|
|
||
| def load_multimodal_components( | ||
|
|
@@ -123,3 +152,27 @@ def load_multimodal_components( | |
|
|
||
| print(f"Successfully loaded {len(multimodal_state_dict)} multimodal tensors") | ||
| return multimodal_state_dict | ||
|
|
||
|
|
||
| def copy_non_safetensor_files_from_ckpt(src: str | os.PathLike, dst: str | os.PathLike): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

[SUGGESTION] The current implementation copies everything non-safetensors from the source — including the files modelopt owns (`config.json`, `generation_config.json`, `hf_quant_config.json`, `preprocessor_config.json`) — and relies on the caller overwriting them afterwards. The risk is the load-bearing convention. If a future refactor adds a guarded path in the caller that skips one of those writes, the stale copy from the source checkpoint is kept silently. Two safer alternatives:

Option 1 — skip the modelopt-owned files while copying:

_MODELOPT_OWNED_FILES = frozenset({
    "config.json",
    "generation_config.json",
    "hf_quant_config.json",
    "preprocessor_config.json",
})

def copy_non_safetensor_files_from_ckpt(src, dst):
    ...
    for entry in os.listdir(src):
        if entry in _MODELOPT_OWNED_FILES:
            continue
        if entry.endswith(".safetensors") or entry == "model.safetensors.index.json":
            continue
        ...

Option 1 removes the silent-failure mode entirely without changing today's behavior. |
||
| """Copy every non-safetensors file from a local HF checkpoint dir verbatim. | ||
|
|
||
| Use as a baseline so tokenizer files, remote_code ``*.py``, README, LICENSE, etc. | ||
| are preserved from the source. The caller is expected to overwrite the files | ||
| modelopt owns (``config.json``, ``generation_config.json``, ``hf_quant_config.json``, | ||
| ``preprocessor_config.json``) after this step. | ||
|
|
||
| Args: | ||
| src: Source HF checkpoint directory. Must be a local path. | ||
| dst: Destination directory; created if missing. | ||
| """ | ||
| if not os.path.isdir(src): | ||
| raise ValueError(f"Invalid source path: {src}. It should be a directory.") | ||
| os.makedirs(dst, exist_ok=True) | ||
| for entry in os.listdir(src): | ||
| sp = os.path.join(src, entry) | ||
| if not os.path.isfile(sp): | ||
| continue | ||
| if entry.endswith(".safetensors") or entry == "model.safetensors.index.json": | ||
| continue | ||
| shutil.copy2(sp, dst) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just a linter change