NVIDIA · danielkorzekwa · Oct 31, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py
@@ -19,6 +19,7 @@
 #!/usr/bin/env python3
 from pathlib import Path
 
+import torch
 from fire import Fire
 from puzzle_tools.checkpoint_utils import copy_tokenizer
 from puzzle_tools.checkpoint_utils_hf import copy_deci_lm_hf_code
@@ -46,7 +47,7 @@ def convert_llama3_config_to_decilm_config(config: LlamaConfig) -> DeciLMConfig:
         dtype = getattr(config, "torch_dtype", None)
 
     # Convert torch.dtype to string if needed (for JSON serialization)
-    if dtype is not None and hasattr(dtype, "__module__") and "torch" in dtype.__module__:
+    if dtype is not None and isinstance(dtype, torch.dtype):
         dtype = str(dtype).replace("torch.", "")
 
     # Track which global values will be removed (moved to per-layer configs)

diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py
@@ -0,0 +1,169 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Compress NAS plugin for the Modelopt framework (based on the Puzzle algorithm: https://arxiv.org/abs/2411.19146).
+"""
+
+import datetime
+from pathlib import Path
+
+import pruning_ckpts
+import score_pruning_activations
+import torch
+from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm
+from torch import nn
+
+from modelopt.torch._compress.runtime import NativeDdpRuntime
+from modelopt.torch.nas.conversion import NASModeRegistry
+from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField
+from modelopt.torch.opt.mode import (
+    ConvertEntrypoint,
+    ConvertReturnType,
+    MetadataDict,
+    ModeDescriptor,
+    RestoreEntrypoint,
+)
+from modelopt.torch.opt.searcher import BaseSearcher
+
+# TODO: Move initialize_hydra_config_for_dir out of tests and into the main package
+from tests.utils.test_utils import initialize_hydra_config_for_dir
+
+
+class CompressModel(nn.Module):
+    pass  # Intentionally empty: no model implementation is needed for the compress mode
+
+
+class CompressConfig(ModeloptBaseConfig):
+    """Configuration for the Compress NAS algorithm (every field defaults to an empty string)."""
+
+    # Input model path to compress in the HF format
+    input_model_path: str = ModeloptField(
+        default="",
+        title="Input model path",
+        description="Input model path to compress in the HF format.",
+    )
+
+    # Hydra config directory containing the search space definition
+    hydra_config_dir: str = ModeloptField(
+        default="",
+        title="Hydra config directory",
+        description="Hydra config directory containing the search space definition.",
+    )
+
+    # Hydra config name containing the search space definition
+    hydra_config_name: str = ModeloptField(
+        default="",
+        title="Hydra config name",
+        description="Hydra config name containing the search space definition.",
+    )
+
+    # Directory to save the compressed model and intermediate results
+    puzzle_dir: str = ModeloptField(
+        default="",
+        title="Puzzle directory",
+        description="Directory to save the compressed model and intermediate results.",
+    )
+
+    # Dataset path to use for scoring in pruning and NAS search
+    dataset_path: str = ModeloptField(
+        default="",
+        title="Dataset path",
+        description="Dataset path to use for scoring in pruning and NAS search.",
+    )
+
+
+def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType:
+    """Run the pre-search stages of the compress pipeline; ``model`` is returned unchanged.
+
+    Stages: (1) convert the HF checkpoint at ``config.input_model_path`` to DeciLM format,
+    (2) score the pruning activations, (3) prune the model and save pruned checkpoints.
+    The output of this step will be used by mnt.search() to perform the NAS search.
+    """
+    # timedelta's first positional argument is days, so this is a 10-day distributed timeout.
+    ten_days = datetime.timedelta(days=10)
+    runtime = NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=ten_days)
+
+    # Load the hydra config, overriding its puzzle_dir and dataset_path entries.
+    overrides = [
+        f"puzzle_dir={config.puzzle_dir}",
+        f"dataset_path={config.dataset_path}",
+    ]
+    hydra_cfg = initialize_hydra_config_for_dir(
+        config_dir=config.hydra_config_dir,
+        config_name=config.hydra_config_name,
+        overrides=overrides,
+    )
+
+    # Stage 1: convert the Llama3 model to DeciLM, targeting <puzzle_dir>/ckpts/teacher.
+    teacher_subdir = "ckpts/teacher"  # TODO: make it configurable
+    teacher_dir = Path(config.puzzle_dir) / teacher_subdir
+    convert_llama3_to_decilm(input_dir=config.input_model_path, output_dir=teacher_dir)
+
+    # Stage 2: score pruning activations (distributed processing).
+    score_pruning_activations.launch_score_activations(hydra_cfg, runtime)
+
+    # Stage 3: only global rank 0 prunes and saves; every rank then calls wait_for_everyone().
+    if runtime.global_rank == 0:
+        pruning_ckpts.launch_prune_ckpt(hydra_cfg)
+    runtime.wait_for_everyone()
+
+    # Nothing is stored in the returned dict (second tuple element).
+    return model, {}
+
+
+def restore_compress_model(
+    model: nn.Module, config: CompressConfig, metadata: MetadataDict
+) -> nn.Module:
+    """Return ``model`` unchanged: compress mode saves no model state, so restore is a no-op."""
+    return model
+
+
+@NASModeRegistry.register_mode
+class CompressDescriptor(ModeDescriptor):
+    """Descriptor for the ``"compress"`` mode, registered with ``NASModeRegistry``."""
+
+    @property
+    def name(self) -> str:
+        """String identifier for this mode (``"compress"``)."""
+        return "compress"
+
+    @property
+    def config_class(self) -> type[ModeloptBaseConfig]:
+        """Configuration class for this mode (``CompressConfig``)."""
+        return CompressConfig
+
+    @property
+    def search_algorithm(self) -> type[BaseSearcher]:
+        """Searcher for this mode; none exists yet, so access raises ``NotImplementedError``."""
+        raise NotImplementedError("Compress mode does not have a search algorithm yet.")
+
+    @property
+    def convert(self) -> ConvertEntrypoint:
+        """Entrypoint to convert a model (``convert_compress_model``)."""
+        return convert_compress_model
+
+    @property
+    def restore(self) -> RestoreEntrypoint:
+        """Entrypoint to restore a model (``restore_compress_model``)."""
+        return restore_compress_model
+
+    @property
+    def export_mode(self) -> str | None:
+        """The mode that corresponds to the export mode (``"export_nas"``).
+        For now, export will be a no-op, as modelopt has no concept of a search space
+        defined for the compress algorithm.
+        """
+        return "export_nas"
diff --git a/tests/experimental/torch/_compress/conftest.py b/tests/experimental/torch/_compress/conftest.py
@@ -13,108 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import shutil
 from pathlib import Path
 
 import pytest
-import torch
-from datasets import Dataset, DatasetDict
-from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase
 
 
 @pytest.fixture
 def project_root_path(request: pytest.FixtureRequest) -> Path:
     """Fixture providing the project root path for tests."""
     return Path(request.config.rootpath)
-
-
-def create_and_save_small_llama_model(
-    output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase
-):
-    """
-    Create and save a small Llama model for testing the conversion pipeline.
-    This mimics having a real Llama checkpoint that needs to be converted.
-    """
-    os.makedirs(output_path, exist_ok=True)
-
-    # Create a minimal Llama config (small for testing)
-    # Note: intermediate_size must be divisible by 256 per DeciLM config requirements
-    # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility
-    llama_config = LlamaConfig(
-        vocab_size=vocab_size,
-        hidden_size=256,  # 32 heads times 8 head_dim = 256 (matches bypass config expectations)
-        intermediate_size=512,  # Must be divisible by 256
-        num_hidden_layers=2,
-        num_attention_heads=32,  # Matches original test
-        num_key_value_heads=8,  # GQA: 32÷4=8 (matches original n_heads_in_group=4)
-        max_position_embeddings=512,
-        rms_norm_eps=1e-5,
-        rope_theta=10000.0,
-        attention_bias=False,
-        hidden_act="silu",
-        tie_word_embeddings=False,
-    )
-
-    # Create and save the Llama model
-    model = LlamaForCausalLM(llama_config)
-    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
-
-    # Save tokenizer
-    tokenizer.save_pretrained(output_path)
-
-    # Save config
-    llama_config.save_pretrained(output_path)
-
-
-def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase:
-    """
-    Create a tokenizer for the Llama model.
-    """
-    tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer"
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-    return tokenizer
-
-
-def setup_puzzle_dir(puzzle_dir: str):
-    if Path(puzzle_dir).exists():
-        shutil.rmtree(puzzle_dir)
-        Path(puzzle_dir).mkdir(parents=True, exist_ok=True)
-
-
-def save_dummy_dataset(dataset_path: str):
-    # dummy sample
-    sample = [
-        {"role": "user", "content": "please cite Lorem Ipsum?"},
-        {
-            "role": "assistant",
-            "content": (
-                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. "
-                "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, "
-                "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, "
-                "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, "
-                "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. "
-                "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, "
-                "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. "
-                "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, "
-                "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. "
-                "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, "
-                "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. "
-                "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. "
-                "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. "
-                "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. "
-                "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. "
-                "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. "
-                "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. "
-                "Donec mollis convallis massa quis iaculis."
-            ),
-        },
-    ]
-
-    # Prepare train and val splits with sample repeated, 2500 samples are for
-    # 128 samples with block-size 8192 and LLama3 tokenizer
-    data = [{"conversation": sample}] * 2500
-
-    # For train-val splits
-    data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)})
-    data_dict.save_to_disk(dataset_path)
diff --git a/...rimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/...rimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py
@@ -16,7 +16,7 @@
 import json
 from pathlib import Path
 
-from experimental.torch._compress.conftest import (
+from experimental.torch._compress.test_utils import (
     create_and_save_small_llama_model,
     create_tokenizer,
 )