Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8ccb17a
transformers upgrade
Jan 26, 2026
022fcef
transformers upgrade
Jan 26, 2026
588d3bb
fix rope theta and other misc issues
yaoyu-33 Jan 26, 2026
81ad475
fix glm 45 functional test
yaoyu-33 Jan 26, 2026
c84fee9
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Jan 30, 2026
caf33c0
vlm fixes
yaoyu-33 Jan 30, 2026
fa5ee53
gemma fix
yaoyu-33 Jan 30, 2026
a6c5813
fix glm
yaoyu-33 Jan 30, 2026
f4c5c69
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 2, 2026
aa0536c
fix unit tests
yaoyu-33 Feb 2, 2026
fbb9b96
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 4, 2026
798bcbb
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 11, 2026
d6d9aac
Merge remote-tracking branch 'origin/chore/transformers_5p0' into cho…
yaoyu-33 Feb 11, 2026
77e5633
fix deepseek and gemma3
yaoyu-33 Feb 12, 2026
04060d9
glm fix
yaoyu-33 Feb 12, 2026
1ca98f9
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 18, 2026
34c55cc
update uv.lock
yaoyu-33 Feb 18, 2026
a9e73a5
fix nemotronh and qwen3moe
yaoyu-33 Feb 19, 2026
96c7291
fix vlm models
yaoyu-33 Feb 20, 2026
fcf858e
Merge branch 'chore/transformers_5p0' of github.com:NVIDIA-NeMo/Megat…
yaoyu-33 Feb 20, 2026
437a2ad
fix: handle transformers 5.0 rope_theta migration to rope_parameters
yaoyu-33 Feb 20, 2026
2414d37
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 20, 2026
5a99c0e
revert unwanted change
yaoyu-33 Feb 20, 2026
b5f6253
refactor: use rope compat functions as direct imports instead of stat…
yaoyu-33 Feb 20, 2026
2e6bbe8
fix: update tests for transformers 5.0 compatibility
yaoyu-33 Feb 20, 2026
5b9cea4
Merge remote-tracking branch 'origin/main' into chore/transformers_5p0
yaoyu-33 Feb 20, 2026
05852e1
fix: handle nested text_config in Qwen2.5 VL conversion test for tran…
yaoyu-33 Feb 23, 2026
ae12a45
fix: handle rope_parameters for transformers 5.0+ compatibility
yaoyu-33 Feb 23, 2026
432173f
fix: handle both pre-5.0 and 5.0+ HF expert weight layouts in Qwen3 V…
yaoyu-33 Feb 23, 2026
5ec9171
fix: update nemotron conversion tests for transformers 5.0 compatibility
yaoyu-33 Feb 23, 2026
494b4b1
Merge remote-tracking branch 'origin/main' into chore/transformers_5p0
yaoyu-33 Feb 23, 2026
c3c8be7
update uv.lock
Feb 23, 2026
d69cd23
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 25, 2026
5388a3c
update uv lock
yaoyu-33 Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion examples/conversion/hf_megatron_roundtrip_multi_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@
HF_MODEL_ID = "meta-llama/Llama-3.2-1B"
console = Console()

# Parameters where Megatron and HF may use different dtypes.
# These are compared in float32 to avoid false mismatches.
IGNORE_PRECISION_PARAMS = [
"e_score_correction_bias",
]


@torchrun_main
def main(
Expand Down Expand Up @@ -155,8 +161,15 @@ def main(
for name, param in bridge.export_hf_weights(megatron_model, show_progress=False):
if is_rank_0:
original_param = bridge.hf_pretrained.state[name]
compare_param = param
compare_original = original_param
# Cast to float32 for params with known dtype mismatches between Megatron and HF
# (e.g. Megatron keeps expert_bias in float32 while HF may use bfloat16)
if any(p in name for p in IGNORE_PRECISION_PARAMS):
compare_param = param.float()
compare_original = original_param.float()
match = torch.allclose(
param, original_param.to(param.device), atol=1e-1
compare_param, compare_original.to(compare_param.device), atol=1e-1
) # Increased tolerance for bfloat16
all_match = all_match and match
table.add_row(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ classifiers = [
"Topic :: Utilities",
]
dependencies = [
"transformers<5.0.0",
"transformers>=5.0.0",
"datasets>=2.20.0",
"accelerate",
"omegaconf>=2.3.0",
Expand Down
25 changes: 18 additions & 7 deletions src/megatron/bridge/models/conversion/model_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
MegatronParamMapping,
)
from megatron.bridge.models.conversion.peft_bridge import AdapterWeightConversionTask, MegatronPeftBridge
from megatron.bridge.models.conversion.transformers_compat import (
rope_theta_from_hf,
)
from megatron.bridge.models.conversion.utils import (
extract_sort_key,
get_module_and_param_from_name,
Expand Down Expand Up @@ -364,6 +367,14 @@ def hf_config_to_provider_kwargs(self, hf_config) -> dict:
if value is not None:
provider_kwargs[megatron_name] = value

# Extract rotary_base via compat function (handles both legacy rope_theta
# attribute and transformers 5.0+ rope_parameters dict)
if "rotary_base" not in provider_kwargs:
try:
provider_kwargs["rotary_base"] = rope_theta_from_hf(hf_config)
except ValueError:
pass

# Handle rope scaling: extract params from rope_scaling dict
# HF configs use either "type" or "rope_type" key for the scaling type
from megatron.bridge.models.mla_provider import MLAModelProvider
Expand Down Expand Up @@ -945,7 +956,7 @@ def stream_weights_megatron_to_hf(
megatron_to_hf_tasks = conversion_tasks
unwrapped_model = unwrap_model(megatron_model)[0]
model_config = unwrapped_model.config
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config)

hf_state_dict: Mapping[str, torch.Tensor] = hf_pretrained.state if hasattr(hf_pretrained, "state") else {}

Expand Down Expand Up @@ -1107,11 +1118,11 @@ def _get_provider_from_model(self, model: MegatronModule) -> ModelProviderTarget
return model.config

def _share_embeddings_and_output_weights(
self, model_config: TransformerConfig, model: Optional[MegatronModule]
self,
model_config: TransformerConfig,
) -> bool:
"""Fallback-aware accessor for shared embedding setting."""
fallback = getattr(model, "share_embeddings_and_output_weights", False) if model else False
return getattr(model_config, "share_embeddings_and_output_weights", fallback)
"""Shared embedding setting."""
return getattr(model_config, "share_embeddings_and_output_weights")

def _unwrap_name(self, name: str) -> str:
"""Unwrap name from DDP or other wrappers.
Expand Down Expand Up @@ -1147,7 +1158,7 @@ def _broadcast_shared_embeddings(self, megatron_model: Union[MegatronModel, List
if hasattr(unwrapped_model, "language_model") and unwrapped_model.language_model is not None:
unwrapped_model = unwrapped_model.language_model
model_config = unwrapped_model.config
share_embeddings = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
share_embeddings = self._share_embeddings_and_output_weights(model_config)

# TODO(yuya): Fix for VPP, the vp stage needs to be passed in for stage checks
if (share_embeddings and model_config.pipeline_model_parallel_size > 1) and (
Expand Down Expand Up @@ -1190,7 +1201,7 @@ def build_conversion_tasks(
mapping_registry = self.mapping_registry()
unwrapped_model = unwrap_model(megatron_model)[0]
model_config = unwrapped_model.config
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config)
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
sorted_global_param_names_all_pp_ranks = self._megatron_global_param_names_all_pp_ranks(megatron_model)

Expand Down
184 changes: 184 additions & 0 deletions src/megatron/bridge/models/conversion/transformers_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compatibility utilities for HuggingFace transformers 5.0+ configs."""


def rope_theta_from_hf(config) -> float:
    """Extract rope_theta from a HuggingFace config.

    Supports both the legacy layout (a direct ``rope_theta`` attribute,
    transformers <5.0) and the transformers 5.0+ layouts, where the value may
    live in the ``rope_parameters`` dict — flat, or nested under ``"global"``
    / ``"full_attention"`` for models like Gemma3 — or in ``default_theta``.

    Args:
        config: HuggingFace configuration object.

    Returns:
        float: The rope_theta value for rotary embeddings.

    Raises:
        ValueError: If rope_theta is not found in any supported location.
    """

    def _candidates():
        # Legacy direct attribute (transformers <5.0).
        yield getattr(config, "rope_theta", None)

        params = getattr(config, "rope_parameters", None)
        if params:
            # Flat layout: rope_parameters["rope_theta"].
            yield params.get("rope_theta")
            # Gemma3-style nesting: rope_parameters["global"]["base"].
            global_section = params.get("global")
            if isinstance(global_section, dict):
                yield global_section.get("base")
            # Gemma3 (transformers 5.0+): rope_parameters["full_attention"]["rope_theta"].
            full_attention = params.get("full_attention")
            if isinstance(full_attention, dict):
                yield full_attention.get("rope_theta")

        default_theta = getattr(config, "default_theta", None)
        if default_theta:
            # default_theta may be a plain number (e.g. NemotronH) ...
            if isinstance(default_theta, (int, float)):
                yield float(default_theta)
            # ... or a dict keyed by layer kind (e.g. Gemma3).
            elif isinstance(default_theta, dict):
                yield default_theta.get("global")

    # First non-None candidate wins, preserving the lookup priority above.
    for value in _candidates():
        if value is not None:
            return value

    raise ValueError(
        "rope_theta not found in config. Expected either 'rope_theta' attribute "
        "(transformers <5.0), 'rope_parameters[\"rope_theta\"]', "
        '\'rope_parameters["global"]["base"]\', \'rope_parameters["full_attention"]["rope_theta"]\', '
        "or 'default_theta[\"global\"]' (transformers >=5.0)."
    )


def rope_local_base_freq_from_hf(config) -> float:
    """Extract rope_local_base_freq from a HuggingFace config.

    Counterpart of :func:`rope_theta_from_hf` for the local base frequency
    used by models with interleaved local/global attention (e.g., Gemma3).
    Handles the legacy direct attribute (transformers <5.0) as well as the
    transformers 5.0+ ``rope_parameters`` layouts, the ``rope_scaling``
    fallback, and ``default_theta["local"]``.

    Args:
        config: HuggingFace configuration object.

    Returns:
        float: The rope_local_base_freq value.

    Raises:
        ValueError: If rope_local_base_freq is not found in any supported location.
    """

    def _candidates():
        # Legacy direct attribute (transformers <5.0).
        yield getattr(config, "rope_local_base_freq", None)

        params = getattr(config, "rope_parameters", None)
        if params:
            # Flat layout: rope_parameters["rope_local_base_freq"].
            yield params.get("rope_local_base_freq")
            # Gemma3-style nesting: rope_parameters["local"]["base"].
            local_section = params.get("local")
            if isinstance(local_section, dict):
                yield local_section.get("base")
            # Gemma3 (transformers 5.0+): rope_parameters["sliding_attention"]["rope_theta"].
            sliding_section = params.get("sliding_attention")
            if isinstance(sliding_section, dict):
                yield sliding_section.get("rope_theta")

        # rope_scaling carries the value for some configs.
        scaling = getattr(config, "rope_scaling", None)
        if scaling:
            yield scaling.get("rope_local_base_freq")

        # default_theta dict (transformers 5.0+), keyed by layer kind.
        default_theta = getattr(config, "default_theta", None)
        if default_theta and isinstance(default_theta, dict):
            yield default_theta.get("local")

    # First non-None candidate wins, preserving the lookup priority above.
    for value in _candidates():
        if value is not None:
            return value

    raise ValueError(
        "rope_local_base_freq not found in config. Expected either 'rope_local_base_freq' attribute "
        "(transformers <5.0), 'rope_parameters[\"rope_local_base_freq\"]', "
        '\'rope_parameters["local"]["base"]\', \'rope_parameters["sliding_attention"]["rope_theta"]\', '
        "'rope_scaling[\"rope_local_base_freq\"]', or 'default_theta[\"local\"]' (transformers >=5.0)."
    )


def rope_scaling_factor_from_hf(config, default: float = 1.0) -> float:
    """Extract the rope scaling factor from a HuggingFace config.

    Looks in the legacy ``rope_scaling`` dict first (transformers <5.0 and
    some 5.0+ models), then in ``rope_parameters`` (transformers >=5.0),
    checking the per-layer-type sections ("full_attention", "global") before
    the flat layout. Unlike the other compat helpers, a missing value is not
    an error — ``default`` is returned instead.

    Args:
        config: HuggingFace configuration object.
        default: Value returned when no scaling factor is present.

    Returns:
        float: The rope scaling factor value, or ``default`` if not found.
    """
    # Legacy rope_scaling dict (transformers <5.0 and some 5.0+ models).
    scaling = getattr(config, "rope_scaling", None)
    if isinstance(scaling, dict):
        factor = scaling.get("factor")
        if factor is not None:
            return factor

    params = getattr(config, "rope_parameters", None)
    if params:
        # Per-layer-type sections first (Gemma3-style nesting), then the
        # flat layout (params itself holds the "factor" key).
        for section in (params.get("full_attention"), params.get("global"), params):
            if isinstance(section, dict):
                factor = section.get("factor")
                if factor is not None:
                    return factor

    return default
60 changes: 0 additions & 60 deletions src/megatron/bridge/models/deepseek/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,66 +13,6 @@
# limitations under the License.

from megatron.bridge.models.conversion.param_mapping import AutoMapping, GatedMLPMapping
from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM


try:
import apex # noqa: F401

HAVE_APEX = True
except ImportError:
HAVE_APEX = False


def get_common_configs(hf_pretrained: PreTrainedCausalLM) -> dict:
"""
Returns a dictionary of common configurations for the DeepSeek family of models.
"""
hf_config = hf_pretrained.config

configs = {}

if not HAVE_APEX:
configs["gradient_accumulation_fusion"] = False

if hasattr(hf_config, "rope_scaling") and hf_config.rope_scaling is not None:
configs["rotary_scaling_factor"] = hf_config.rope_scaling["factor"]
configs["mscale"] = hf_config.rope_scaling["mscale"]
configs["mscale_all_dim"] = hf_config.rope_scaling["mscale_all_dim"]
else:
configs["rotary_scaling_factor"] = 1.0
configs["mscale"] = 1.0
configs["mscale_all_dim"] = 1.0

configs["num_layers"] = hf_config.num_hidden_layers
configs["hidden_size"] = hf_config.hidden_size
configs["ffn_hidden_size"] = hf_config.intermediate_size
configs["num_attention_heads"] = hf_config.num_attention_heads
configs["num_query_groups"] = hf_config.num_key_value_heads
configs["q_lora_rank"] = hf_config.q_lora_rank
configs["num_moe_experts"] = hf_config.n_routed_experts
configs["moe_ffn_hidden_size"] = hf_config.moe_intermediate_size
configs["moe_shared_expert_intermediate_size"] = hf_config.moe_intermediate_size * hf_config.n_shared_experts
configs["moe_layer_freq"] = [0] * hf_config.first_k_dense_replace + [1] * (
hf_config.num_hidden_layers - hf_config.first_k_dense_replace
)
configs["moe_router_topk"] = hf_config.num_experts_per_tok
configs["moe_router_num_groups"] = hf_config.n_group
configs["moe_router_group_topk"] = hf_config.topk_group
configs["moe_router_topk_scaling_factor"] = hf_config.routed_scaling_factor
configs["kv_lora_rank"] = hf_config.kv_lora_rank
configs["qk_head_dim"] = hf_config.qk_nope_head_dim
configs["qk_pos_emb_head_dim"] = hf_config.qk_rope_head_dim
configs["v_head_dim"] = hf_config.v_head_dim

# Ensure MLA is enabled
configs["multi_latent_attention"] = True
configs["vocab_size"] = hf_config.vocab_size
configs["rotary_base"] = hf_config.rope_theta
configs["init_method_std"] = hf_config.initializer_range
configs["layernorm_epsilon"] = hf_config.rms_norm_eps

return configs


def get_common_mapping_list() -> list:
Expand Down
4 changes: 3 additions & 1 deletion src/megatron/bridge/models/deepseek/deepseek_v3_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge, WeightConversionTask
from megatron.bridge.models.conversion.param_mapping import AutoMapping
from megatron.bridge.models.conversion.transformers_compat import rope_theta_from_hf
from megatron.bridge.models.deepseek.common import get_common_mapping_list
from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM
from megatron.bridge.models.mla_provider import MLAModelProvider
Expand Down Expand Up @@ -62,6 +63,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
provider.moe_token_dispatcher_type = "alltoall"
provider.moe_router_load_balancing_type = "seq_aux_loss"
provider.moe_shared_expert_overlap = True
provider.moe_router_score_function = "sigmoid"
provider.moe_router_enable_expert_bias = True
provider.moe_router_dtype = "fp32"
provider.moe_permute_fusion = True
Expand Down Expand Up @@ -138,7 +140,7 @@ def maybe_modify_converted_hf_weight(
inv_freq = getattr(self, "_deepseek_inv_freq", None)
if inv_freq is None:
rotary_dim = self.hf_config.qk_rope_head_dim
rotary_base = self.hf_config.rope_theta
rotary_base = rope_theta_from_hf(self.hf_config)
inv_freq = 1.0 / (rotary_base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
self._deepseek_inv_freq = inv_freq

Expand Down
Loading
Loading