Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e82164f
Add anymodel directories to feature/puzzletron
danielkorzekwa Mar 4, 2026
2099df3
Make any_model conversion working.
danielkorzekwa Mar 5, 2026
eb5cf8a
Update child_init.py with anymodel version
danielkorzekwa Mar 5, 2026
c9de41c
fix attention pruning
danielkorzekwa Mar 5, 2026
3c1bc1f
Add trust_remote_code to load_model_config (default to false)
danielkorzekwa Mar 5, 2026
6cc2194
Comment all tested models aside of llama_3_1_8b_instruct
danielkorzekwa Mar 5, 2026
ee4e1e3
Delete not needed decilm test
danielkorzekwa Mar 5, 2026
449b523
Fix broken tests
danielkorzekwa Mar 5, 2026
fb27bba
Update puzzletron_nas_pluging to any_model version
danielkorzekwa Mar 5, 2026
b350f82
Correct test resources used by tests.
danielkorzekwa Mar 5, 2026
fafe5a3
Disable puzzletron tests (will be enabled after all any_model logic i…
danielkorzekwa Mar 5, 2026
c717852
Comment out not implemented models.
danielkorzekwa Mar 6, 2026
030f126
format python docs
danielkorzekwa Mar 6, 2026
70df0df
Use trust_remote_code in force_cache_dynamic_modules()
danielkorzekwa Mar 6, 2026
ee8f538
Fix build docs issue.
danielkorzekwa Mar 6, 2026
47414d5
Clarify readme and avoid reusing the same reference in llama_converter.
danielkorzekwa Mar 9, 2026
a8305d8
Fix tied-embedding handling before writing the safetensors index.
danielkorzekwa Mar 9, 2026
68421a5
Fix NaN ranking: NaNs were selected as “best” experts by default.
danielkorzekwa Mar 9, 2026
d6b8028
Code clean up.
danielkorzekwa Mar 9, 2026
ecd2341
Code clean up.
danielkorzekwa Mar 10, 2026
f9d845d
code clean up
danielkorzekwa Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions modelopt/torch/puzzletron/anymodel/converter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converters for transforming HuggingFace models to AnyModel format."""

from .convert_any_model import *
from .converter import *
from .converter_factory import *
68 changes: 68 additions & 0 deletions modelopt/torch/puzzletron/anymodel/converter/convert_any_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mypy: ignore-errors

"""Convert a HuggingFace model to AnyModel format."""

from pathlib import Path

from modelopt.torch.puzzletron.anymodel.converter.converter import Converter
from modelopt.torch.puzzletron.anymodel.converter.converter_factory import ConverterFactory
from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptorFactory

__all__ = ["convert_model"]


def convert_model(
    input_dir: str,
    output_dir: str,
    converter: Converter | str,
):
    """Convert a HuggingFace checkpoint directory into an AnyModel checkpoint.

    The work is delegated to the ``convert`` method of the converter resolved
    through ``ConverterFactory``, which is given the descriptor resolved through
    ``ModelDescriptorFactory`` from the same ``converter`` value. The
    conversion process:

    1. Copies non-weight files (config, tokenizer, etc.)
    2. Creates block_configs for each layer
    3. Reorganizes weights into subblock checkpoints

    Args:
        input_dir: Path to the input HuggingFace checkpoint directory.
        output_dir: Path to the output AnyModel checkpoint directory. It is
            created (including parents) when it does not exist yet.
        converter: Value handed to both ``ModelDescriptorFactory.get`` and
            ``ConverterFactory.get``, e.g. the registered name ``"llama"``.
            NOTE(review): the annotation also admits a ``Converter`` class, but
            the very same object is then used for the descriptor lookup as
            well -- confirm the factories support that before relying on it.

    Example:
        >>> convert_model(
        ...     input_dir="/path/to/Llama-3.1-8B-Instruct",
        ...     output_dir="/path/to/output/ckpts/teacher",
        ...     converter="llama",
        ... )
    """
    source_path = Path(input_dir)
    target_path = Path(output_dir)
    target_path.mkdir(parents=True, exist_ok=True)

    # Descriptor and converter are looked up with the same value.
    model_descriptor = ModelDescriptorFactory.get(converter)
    converter_cls = ConverterFactory.get(converter)

    converter_cls.convert(
        descriptor=model_descriptor,
        input_dir=source_path,
        output_dir=target_path,
    )
Comment on lines +31 to +62

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

converter API contract is currently inconsistent with implementation.

The function documents support for passing a Converter class, but the current resolution path expects a registry key and can produce an invalid descriptor object when a class is passed.

✅ Proposed fix (restrict and validate)
 def convert_model(
     input_dir: str,
     output_dir: str,
-    converter: Converter | str,
+    converter: str,
 ):
@@
-    descriptor = ModelDescriptorFactory.get(converter)
-    converter = ConverterFactory.get(converter)
-
-    converter.convert(descriptor=descriptor, input_dir=input_dir, output_dir=output_dir)
+    descriptor = ModelDescriptorFactory.get(converter)
+    converter_cls = ConverterFactory.get(converter)
+    converter_cls.convert(descriptor=descriptor, input_dir=input_dir, output_dir=output_dir)

Also update the docstring to state converter is a registered converter name.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/puzzletron/anymodel/converter/convert_any_model.py` around
lines 31 - 62, The converter parameter is documented as accepting a Converter
class but the implementation calls ModelDescriptorFactory.get and
ConverterFactory.get and will break if a class is passed; change the API
contract to require a registered converter name (string) by validating that
converter is an instance of str at the start of convert_model and raising a
clear TypeError if not, update the docstring to state "converter is a registered
converter name (str)" and adjust any variable names/comments around
ModelDescriptorFactory.get and ConverterFactory.get to reflect that they expect
a registry key rather than a class.



if __name__ == "__main__":
    # Imported lazily so that `fire` is only required when this module is run
    # as a script; Fire turns convert_model's parameters into CLI arguments.
    import fire

    fire.Fire(convert_model)
235 changes: 235 additions & 0 deletions modelopt/torch/puzzletron/anymodel/converter/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mypy: ignore-errors

import copy
import fnmatch
import json
import os
import shutil
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

from safetensors.torch import load_file, save_file
from tqdm import tqdm
from transformers import PretrainedConfig
from transformers.integrations.mxfp4 import convert_moe_packed_tensors

from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import load_model_config, save_model_config

__all__ = ["Converter"]


class Converter(ABC):
"""Base class for converting HuggingFace models to Puzzletron/AnyModel format."""

@staticmethod
def _get_weight_map(input_dir: Path) -> Dict[str, str]:
"""Load weight map from checkpoint directory (supports both sharded and single-file models).

Returns a dict mapping parameter names to their safetensors filenames.
"""
index_path = input_dir / "model.safetensors.index.json"
single_file_path = input_dir / "model.safetensors"

if index_path.exists():
# Sharded model
with open(index_path, "r") as f:
index = json.load(f)
return index["weight_map"]
elif single_file_path.exists():
# Single file model - create a synthetic weight map
data = load_file(single_file_path)
return {name: "model.safetensors" for name in data.keys()}
else:
raise FileNotFoundError(
f"Neither {index_path} nor {single_file_path} found. Cannot determine model format."
)

@classmethod
def convert_model_weights(
    cls, input_dir: Path, output_dir: Path, descriptor: ModelDescriptor, num_hidden_layers: int
):
    """Convert model weights to subblock format.

    The checkpoint's parameter names are grouped with
    ``descriptor.get_weight_groups``. For every group, only the source files that
    hold its parameters are loaded, and each tensor is collected under the name
    returned by ``cls.convert_weight_name``.

    When the converter class defines ``quantized = "mxfp4"``, a ``*_blocks``
    tensor is unpacked together with its ``*_scales`` sibling through
    ``convert_moe_packed_tensors`` and stored under the converted name with the
    ``_blocks`` suffix removed; ``*_scales`` tensors are not stored on their own.

    Args:
        input_dir: Directory of the source HuggingFace checkpoint.
        output_dir: Directory of the converted checkpoint; the
            ``subblocks_safetensors`` sub-directory is created inside it.
        descriptor: Model descriptor providing ``get_weight_groups``.
        num_hidden_layers: Forwarded to ``descriptor.get_weight_groups``.
    """
    param_to_file = Converter._get_weight_map(input_dir)
    all_param_names = list(param_to_file.keys())

    # Determine subblocks needed
    subblocks = descriptor.get_weight_groups(
        all_param_names, num_hidden_layers=num_hidden_layers
    )

    # Output directory
    out_dir = output_dir / "subblocks_safetensors"
    os.makedirs(out_dir, exist_ok=True)

    # New weight index
    new_index = {"metadata": {"format": "pt"}, "weight_map": {}}

    # Loop-invariant: whether packed MoE tensors must be unpacked (e.g. gpt-oss-20b).
    unpack_mxfp4 = getattr(cls, "quantized", None) == "mxfp4"

    for subblock, param_names in tqdm(subblocks.items(), desc="Processing subblocks"):
        # Bucket this subblock's parameters by source file so each file is loaded once.
        names_by_file = defaultdict(list)
        for name in param_names:
            names_by_file[param_to_file[name]].append(name)

        tensors = {}

        # Sorted so the order of the collected tensors is reproducible across runs.
        for file in sorted(names_by_file):
            data = load_file(os.path.join(input_dir, file))
            for name in names_by_file[file]:
                if name not in data:
                    continue
                converted_name = cls.convert_weight_name(name)
                if not unpack_mxfp4:
                    tensors[converted_name] = data[name]
                elif name.endswith("_blocks"):
                    # `data` is keyed by the ORIGINAL checkpoint names, so both lookups
                    # use `name`; the converted name is only the output key.
                    # NOTE(review): assumes the matching "_scales" tensor lives in the
                    # same source file as its "_blocks" tensor -- confirm.
                    scales_name = name.removesuffix("_blocks") + "_scales"
                    tensors[converted_name.removesuffix("_blocks")] = (
                        convert_moe_packed_tensors(data[name], data[scales_name])
                    )
                elif not name.endswith("_scales"):
                    # "_scales" tensors are consumed together with their "_blocks" partner.
                    tensors[converted_name] = data[name]
Comment on lines +100 to +113

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, let's locate and examine the converter.py file
fd -t f "converter.py" -p "modelopt/torch/puzzletron"

Repository: NVIDIA/Model-Optimizer

Length of output: 190


🏁 Script executed:

#!/bin/bash
# Examine the converter.py file around lines 100-113 with broader context
cat -n modelopt/torch/puzzletron/anymodel/converter/converter.py | head -150 | tail -100

Repository: NVIDIA/Model-Optimizer

Length of output: 4863


🏁 Script executed:

#!/bin/bash
# Search for convert_moe_packed_tensors function definition and usage
rg -n "def convert_moe_packed_tensors|convert_moe_packed_tensors" --type=py -B2 -A5

Repository: NVIDIA/Model-Optimizer

Length of output: 1960


🏁 Script executed:

#!/bin/bash
# Look for tests related to MoE conversion
fd -t f "test_*.py" -o "*_test.py" | xargs rg -l "moe|MoE" 2>/dev/null

Repository: NVIDIA/Model-Optimizer

Length of output: 5021


🏁 Script executed:

#!/bin/bash
# Search for convert_weight_name implementation
rg -n "def convert_weight_name" --type=py -A10

Repository: NVIDIA/Model-Optimizer

Length of output: 1214


🏁 Script executed:

#!/bin/bash
# Search for subclasses that override convert_weight_name
rg -n "def convert_weight_name" --type=py -A5

Repository: NVIDIA/Model-Optimizer

Length of output: 742


🏁 Script executed:

#!/bin/bash
# Search for mxfp4 quantization usage and MoE conversion tests
rg -n "mxfp4|convert_moe_packed_tensors" --type=py -B3 -A3

Repository: NVIDIA/Model-Optimizer

Length of output: 19846


🏁 Script executed:

#!/bin/bash
# Look for llama converter which might use the base Converter
cat -n modelopt/torch/puzzletron/anymodel/models/llama/llama_converter.py | head -50

Repository: NVIDIA/Model-Optimizer

Length of output: 2273


🏁 Script executed:

#!/bin/bash
# Search for actual MoE checkpoint structure documentation or examples
rg -n "moe.*blocks.*scales|_blocks.*_scales" --type=py -B2 -A2 | head -50

Repository: NVIDIA/Model-Optimizer

Length of output: 1230


🏁 Script executed:

#!/bin/bash
# Check if there are any actual tests that exercise the MoE conversion path with real data
rg -n "convert_model_weights.*mxfp4|gpt-oss-20b" --type=py -B3 -A3

Repository: NVIDIA/Model-Optimizer

Length of output: 3999


🏁 Script executed:

#!/bin/bash
# Verify the logic by examining the exact flow: what names come from param_to_file
cat -n modelopt/torch/puzzletron/anymodel/converter/converter.py | sed -n '65,130p'

Repository: NVIDIA/Model-Optimizer

Length of output: 3434


🏁 Script executed:

#!/bin/bash
# Check how param_to_file is built to understand what names it contains
cat -n modelopt/torch/puzzletron/anymodel/converter/converter.py | sed -n '43,63p'

Repository: NVIDIA/Model-Optimizer

Length of output: 1158


🏁 Script executed:

#!/bin/bash
# Verify by checking if any existing code might override convert_weight_name for MoE models
rg -n "class.*Converter.*:|quantized.*=" modelopt/torch/puzzletron --type=py -A5 | grep -A5 "quantized"

Repository: NVIDIA/Model-Optimizer

Length of output: 3733


🏁 Script executed:

#!/bin/bash
# Final verification: check the exact structure of what gets passed to convert_moe_packed_tensors
rg -n "def convert_moe_packed_tensors" --type=py -A10 -B2

Repository: NVIDIA/Model-Optimizer

Length of output: 48


Use original parameter name to construct data dict keys in MoE conversion logic.

Lines 105-106 access data[converted_name + "_blocks"] and data[converted_name + "_scales"], but the data dict is keyed by original checkpoint parameter names. The converted_name is derived from applying convert_weight_name() transformation, so any subclass override that transforms the parameter prefix would cause a KeyError at runtime.

Fix: Construct data keys using the original name parameter:

data[name],
data[name.replace("_blocks", "_scales")]

Reserve converted_name for storing results in the tensors output dict. This ensures data access always uses original checkpoint names while transformations are applied only to output keys.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/puzzletron/anymodel/converter/converter.py` around lines 100 -
113, The MoE conversion is indexing the source checkpoint dict using
converted_name which can differ from the original checkpoint keys; update the
branch in converter.py (inside the loop that handles MoE packed tensors) to read
from data using the original parameter name (the variable name) when fetching
the "_blocks" and "_scales" entries, and continue to use converted_name only as
the key for storing the result into tensors (i.e., use name to build the data
lookups like the "_blocks"/"_scales" access and assign tensors[converted_name] =
convert_moe_packed_tensors(...)).


# Tail of convert_model_weights. Indentation is lost in this view: the
# statements down to "Update index" use `subblock`/`tensors` and so presumably
# run once per subblock, while the index file is presumably written once after
# the loop -- confirm against the original file.
# Save this subblock: list the collected tensor names (the message calls them
# "layers"), then write them into a single safetensors file named after the subblock.
print(f"\n✅ Group: {subblock} ({len(tensors)} layers)")
for layer in tensors.keys():
print(f" - {layer}")

subblock_file = f"{subblock}.safetensors"
save_file(tensors, os.path.join(out_dir, subblock_file))

# Update index: point every converted tensor name at the subblock file,
# using a path relative to output_dir.
for new_name in tensors.keys():
new_index["weight_map"][new_name] = f"subblocks_safetensors/{subblock_file}"

# Save new index file next to the copied checkpoint files in output_dir.
with (output_dir / "model.safetensors.index.json").open("w") as f:
json.dump(new_index, f, indent=2)

print(f"✅ Finished saving subblocks and index to {output_dir}")

@classmethod
def convert_configs_in_dirs(
    cls,
    input_dir: Path,
    output_dir: Path,
):
    """Save the model config, extended with per-layer ``block_configs``, to ``output_dir``.

    Args:
        input_dir: Directory the source model config is loaded from.
        output_dir: Directory the extended config is saved to.

    Returns:
        The saved config: a deep copy of the loaded one whose ``block_configs``
        attribute holds the result of ``cls.create_block_configs_from_main_config``.
    """
    source_config = load_model_config(input_dir)
    per_layer_configs = cls.create_block_configs_from_main_config(source_config)

    # Attach the block configs to a deep copy rather than to the loaded object.
    converted_config = copy.deepcopy(source_config)
    converted_config.block_configs = per_layer_configs

    save_model_config(converted_config, output_dir)
    return converted_config

@staticmethod
def copy_checkpoint_files(input_dir: Path, output_dir: Path):
"""Copy checkpoint files except model weights (which will be converted)."""
ignore_patterns = [
"model-*.safetensors",
"model.safetensors",
"model.safetensors.index.json",
"subblocks_safetensors",
]

def ignore_func(dir, files):
ignored = set()
for pattern in ignore_patterns:
ignored.update(fnmatch.filter(files, pattern))
return ignored

shutil.copytree(str(input_dir), str(output_dir), ignore=ignore_func, dirs_exist_ok=True)

@classmethod
def convert(
    cls,
    descriptor: ModelDescriptor,
    input_dir: Path,
    output_dir: Path,
):
    """Run the full HuggingFace-to-AnyModel conversion for one checkpoint.

    Runs three steps in order: copy the non-weight files, save the config with
    its block configs, then convert the weights using the layer count read
    from that config.

    Args:
        descriptor: Model descriptor for the model type.
        input_dir: Path to the input HuggingFace checkpoint.
        output_dir: Path to the output AnyModel checkpoint.
    """
    cls.copy_checkpoint_files(input_dir, output_dir)

    converted_config = cls.convert_configs_in_dirs(input_dir, output_dir)
    layer_count = converted_config.num_hidden_layers

    cls.convert_model_weights(
        input_dir,
        output_dir,
        descriptor=descriptor,
        num_hidden_layers=layer_count,
    )

@staticmethod
@abstractmethod
def create_block_configs_from_main_config(config: PretrainedConfig) -> List[BlockConfig]:
    """Build the list of per-layer BlockConfig objects for a HuggingFace config.

    Subclasses read the layer-specific parameters (e.g., intermediate_size,
    num_key_value_heads) from the main model config and return one BlockConfig
    per hidden layer, so that later pipeline stages can prune or modify layers
    individually.

    Args:
        config: HuggingFace PretrainedConfig (e.g., LlamaConfig, Qwen2Config)

    Returns:
        List of BlockConfig, one per hidden layer. Each BlockConfig contains:
        - AttentionConfig: attention settings (no_op, num_key_value_heads)
        - FFNConfig: FFN settings (no_op, intermediate_size)

    Example:
        For a model with uniform layers (e.g., Llama):
            return [BlockConfig(...) for _ in range(config.num_hidden_layers)]

        For a model with heterogeneous layers (e.g., NemotronH with Mamba/Attention):
            return [BlockConfig(...) for layer_idx in range(num_layers)]

        Note that ``[BlockConfig(...)] * n`` repeats one and the same object n
        times; build a fresh BlockConfig per layer if layers may later be
        modified independently.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

@staticmethod
def convert_weight_name(name: str) -> str:
"""
Convert weight names during checkpoint conversion.

This method can be overridden by subclasses to apply model-specific weight name
transformations when converting checkpoints from HuggingFace format to Puzzletron format.

Default implementation returns the name unchanged (identity function).

Args:
name: Original weight name from HuggingFace checkpoint

Returns:
Converted weight name for Puzzletron format

Example:
For Qwen2.5-VL, this converts:
- visual.* → model.visual.*
- model.* → model.language_model.*
"""
return name
Loading