NVIDIA · danielkorzekwa · Mar 17, 2026 · Mar 4, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -44,6 +44,8 @@ repos:
     rev: v1.17.1
     hooks:
       - id: mypy
+        # Exclude HF config directories to avoid duplicate module errors (e.g., configuration_nemotron_h.py exists in multiple model configs)
+        exclude: "tests/gpu/torch/puzzletron/resources/hf_configs/"
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v21.1.0
@@ -95,6 +97,7 @@ repos:
               modelopt/torch/speculative/eagle/utils.py|
               modelopt/torch/speculative/plugins/transformers.py|
               modelopt/torch/utils/plugins/megatron_mmlu.py|
+              modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
               examples/chained_optimizations/bert_prune_distill_quantize.py|
               examples/deepseek/quantize_to_nvfp4.py|
               examples/deepseek/ptq.py|
@@ -113,7 +116,6 @@ repos:
               examples/speculative_decoding/server_generate.py|
               experimental/dms/models/qwen3/configuration_qwen3_dms.py|
               experimental/dms/models/qwen3/modeling_qwen3_dms.py|
-              modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
           )$
 
       # Default hook for Apache 2.0 in c/c++/cuda files

diff --git a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py
@@ -1142,61 +1142,39 @@ def __call__(
 
 
 class Qwen3VLRemoveExpertsIndependentHook(RemoveExpertsIndependentHook):
-    """Expert removal importance hook for Qwen3-VL models.
-
-    TODO: Implement get_router_logits_and_routed_experts based on Qwen3-VL MoE forward pass.
-    """
+    """Expert removal importance hook for Qwen3-VL models."""
 
     def get_router_logits_and_routed_experts(
         self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """Extract router logits and expert outputs for Qwen3-VL MoE.
 
-        Note: This is a placeholder implementation. Implement based on Qwen3VLMoeSparseMoe forward.
+        Based on Qwen3VLMoeSparseMoe forward pass.
         """
-        batch_size = (
-            hidden_states.shape[0] * hidden_states.shape[1]
-            if hidden_states.ndim > 2
-            else hidden_states.shape[0]
-        )
-        router_logits_out = torch.zeros(
-            batch_size, self.num_local_experts, device=hidden_states.device
-        )
-        routed_experts = hidden_states.view(-1, hidden_states.shape[-1])
-        return router_logits_out, routed_experts
+        orig_shape = hidden_states.shape
 
+        # Flatten to (num_tokens, hidden_size) for processing
+        hidden_states_flat = hidden_states.reshape(-1, self.moe.hidden_size)
 
-class GptOssRemoveExpertsIndependentHook(RemoveExpertsIndependentHook):
-    """Expert removal importance hook for GPT-OSS models.
+        if router_logits is None:
+            router_logits = self.moe.gate(hidden_states_flat)
+
+        routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
+        routing_weights, router_indices = torch.topk(routing_weights, self.moe.top_k, dim=-1)
+        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states_flat.dtype)
+        router_weights = torch.zeros_like(router_logits).scatter_(
+            1, router_indices, routing_weights
+        )
 
-    TODO: Implement get_router_logits_and_routed_experts based on GPT-OSS MoE forward pass.
-    This is a placeholder implementation that allows the framework to run.
-    """
+        # Reshape hidden_states for moe.experts (expects 3D: batch, seq, hidden)
+        # router_weights and router_indices remain 2D (num_tokens, num_experts)
+        batch_size = orig_shape[0] if hidden_states.ndim == 3 else 1
+        hidden_states_3d = hidden_states_flat.reshape(batch_size, -1, self.moe.hidden_size)
 
-    def get_router_logits_and_routed_experts(
-        self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """Extract router logits and expert outputs for GPT-OSS MoE.
+        routed_out = self.moe.experts(hidden_states_3d, router_weights, router_indices)
 
-        Note: This is a placeholder implementation. For proper expert scoring,
-        implement based on GptOssSparseMoeBlock forward pass.
+        # Return in same shape as input
+        routed_out = routed_out.reshape(*orig_shape)
 
-        Args:
-            hidden_states: Input tensor of shape (batch, seq_len, hidden_dim)
-            router_logits: Optional pre-computed router logits
-
-        Returns:
-            tuple of (router_logits, routed_experts):
-                - router_logits: Shape (num_tokens, num_local_experts) - zeros as placeholder
-                - routed_experts: Original hidden states (no-op)
-        """
-        batch_size = (
-            hidden_states.shape[0] * hidden_states.shape[1]
-            if hidden_states.ndim > 2
-            else hidden_states.shape[0]
-        )
-        router_logits_out = torch.zeros(
-            batch_size, self.num_local_experts, device=hidden_states.device
-        )
-        routed_experts = hidden_states.view(-1, hidden_states.shape[-1])
-        return router_logits_out, routed_experts
+        return router_logits, routed_out
diff --git a/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py
@@ -19,8 +19,11 @@
 
 from typing import Type
 
+import torch
+
 from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ForwardHook as ActivationsHook
 from modelopt.torch.puzzletron.tools.logger import aprint
+from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock, DummyModule
 
 
 def register_activation_hooks(
@@ -51,6 +54,16 @@ def register_activation_hooks(
     module_names_to_hook = pruning_mixin.get_module_names_to_hook(model)
     activation_hooks = dict()
     for block_idx, module_name in module_names_to_hook:
+        try:
+            module = model.get_submodule(module_name)
+        except AttributeError:
+            # Module doesn't exist on this rank's shard (e.g., in distributed setup)
+            continue
+
+        # Skip dummy modules - they don't have real activations to hook
+        if isinstance(module, (DummyModule, DummyBlock)):
+            continue
+
         block_config = None
         if block_idx is not None:
             block_config = model.config.block_configs[block_idx]
@@ -59,13 +72,25 @@ def register_activation_hooks(
             "block_config": block_config,
         }
 
-        module = model.get_submodule(module_name)
         hook = hook_class(module, curr_activation_hooks_kwargs)
         module.register_forward_hook(hook)
         activation_hooks[module_name] = hook
 
     if len(activation_hooks) == 0:
-        raise ValueError("couldn't find any hooks")
+        # In distributed mode, it's okay for a rank to have 0 hooks if it doesn't own
+        # the target modules (e.g., with hybrid patterns like "*-" where different
+        # ranks own different layer types). However, we still want to catch real bugs
+        # where no hooks are found at all.
+        is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
+        if is_distributed:
+            aprint(
+                "No hooks registered on this rank. This is expected if this rank "
+                "doesn't own any layers matching the hook pattern (e.g., in hybrid "
+                "patterns with distributed model sharding)."
+            )
+        else:
+            raise ValueError("couldn't find any hooks")
 
-    aprint(f"Found the following hooks: {activation_hooks.keys()}")
+    if len(activation_hooks) > 0:
+        aprint(f"Found the following hooks: {activation_hooks.keys()}")
     return activation_hooks
diff --git a/modelopt/torch/puzzletron/anymodel/converter/converter.py b/modelopt/torch/puzzletron/anymodel/converter/converter.py
@@ -135,9 +135,10 @@ def convert_configs_in_dirs(
         cls,
         input_dir: Path,
         output_dir: Path,
+        trust_remote_code: bool = False,
     ):
         """Convert config and add block_configs."""
-        config = load_model_config(input_dir)
+        config = load_model_config(input_dir, trust_remote_code=trust_remote_code)
 
         block_configs = cls.create_block_configs_from_main_config(config)
         out_config = copy.deepcopy(config)
@@ -179,7 +180,10 @@ def convert(
             output_dir: Path to the output AnyModel checkpoint.
         """
         cls.copy_checkpoint_files(input_dir, output_dir)
-        config = cls.convert_configs_in_dirs(input_dir, output_dir)
+        trust_remote_code = descriptor.requires_trust_remote_code()
+        config = cls.convert_configs_in_dirs(
+            input_dir, output_dir, trust_remote_code=trust_remote_code
+        )
         cls.convert_model_weights(
             input_dir, output_dir, descriptor=descriptor, num_hidden_layers=config.num_hidden_layers
         )

diff --git a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py
@@ -53,6 +53,18 @@ def block_config_to_layer_overrides(block_config: BlockConfig) -> Dict[str, Any]
         """
         raise NotImplementedError
 
+    @staticmethod
+    def requires_trust_remote_code() -> bool:
+        """Whether this model descriptor requires trust_remote_code=True for loading.
+
+        Models that use custom code (e.g., via auto_map in config) should override
+        this to return True.
+
+        Returns:
+            True if trust_remote_code=True is required, False otherwise.
+        """
+        return False
+
     @staticmethod
     def mlp_no_op_post_init(decoder_layer: nn.Module):
         """Post-init callback to alter a decoder layer so that FFN/mlp subblock performs as no-op.

diff --git a/modelopt/torch/puzzletron/anymodel/models/__init__.py b/modelopt/torch/puzzletron/anymodel/models/__init__.py
@@ -14,11 +14,11 @@
 # limitations under the License.
 
 # Import models to trigger factory registration
-# from modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b import *
+from modelopt.torch.puzzletron.anymodel.models.gpt_oss import *
 from modelopt.torch.puzzletron.anymodel.models.llama import *
-# from modelopt.torch.puzzletron.anymodel.models.mistral_small import *
-# from modelopt.torch.puzzletron.anymodel.models.nemotron_h import *
-# from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import *
-# from modelopt.torch.puzzletron.anymodel.models.qwen2 import *
-# from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import *
-# from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import *
+from modelopt.torch.puzzletron.anymodel.models.mistral_small import *
+from modelopt.torch.puzzletron.anymodel.models.nemotron_h import *
+from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import *
+from modelopt.torch.puzzletron.anymodel.models.qwen2 import *
+from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import *
+from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import *
diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss/__init__.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss/__init__.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""GPT-OSS model support for AnyModel."""
+
+from .gpt_oss_converter import GptOssConverter
+from .gpt_oss_model_descriptor import GptOssModelDescriptor
diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss/gpt_oss_converter.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss/gpt_oss_converter.py
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# mypy: ignore-errors
+
+"""GPT-OSS-20B converter for AnyModel compression."""
+
+from typing import List
+
+from transformers import PretrainedConfig
+
+from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
+from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
+    AttentionConfig,
+    BlockConfig,
+    FFNConfig,
+    MoEConfig,
+)
+
+
+@ConverterFactory.register_decorator("gpt_oss")
+class GptOssConverter(Converter):
+    """Converter for GPT-OSS models to AnyModel format.
+
+    GPT-OSS is a pure MoE model with 32/128 experts per layer and 4/16 active experts.
+    All layers use MoE FFN (no standard dense FFN layers).
+    """
+
+    quantized = "mxfp4"
+
+    @staticmethod
+    def create_block_configs_from_main_config(config: PretrainedConfig) -> List[BlockConfig]:
+        """Create block configs for GPT-OSS layers.
+
+        GPT-OSS uses MoE for all FFN layers with:
+        - 32/128 local experts (num_local_experts)
+        - 4/16 active experts per token (experts_per_token)
+        - No dense/standard FFN layers
+        """
+        num_hidden_layers = config.num_hidden_layers
+        num_local_experts = config.num_local_experts
+        experts_per_token = config.experts_per_token
+        intermediate_size = config.intermediate_size
+
+        block_configs = []
+        for layer_idx in range(num_hidden_layers):
+            block_config = BlockConfig(
+                attention=AttentionConfig(
+                    no_op=False, num_key_value_heads=config.num_key_value_heads
+                ),
+                ffn=FFNConfig(
+                    no_op=False,
+                    intermediate_size=None,  # MoE doesn't use this field
+                    moe=MoEConfig(
+                        num_local_experts=num_local_experts,
+                        num_experts_per_tok=experts_per_token,
+                        expert_intermediate_dim=intermediate_size,
+                    ),
+                ),
+            ).to_dict()
+            block_configs.append(block_config)
+
+        return block_configs