Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8ccb17a
transformers upgrade
Jan 26, 2026
022fcef
transformers upgrade
Jan 26, 2026
588d3bb
fix rope theta and other misc issues
yaoyu-33 Jan 26, 2026
81ad475
fix glm 45 functional test
yaoyu-33 Jan 26, 2026
c84fee9
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Jan 30, 2026
caf33c0
vlm fixes
yaoyu-33 Jan 30, 2026
fa5ee53
gemma fix
yaoyu-33 Jan 30, 2026
a6c5813
fix glm
yaoyu-33 Jan 30, 2026
f4c5c69
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 2, 2026
aa0536c
fix unit tests
yaoyu-33 Feb 2, 2026
fbb9b96
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 4, 2026
798bcbb
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 11, 2026
d6d9aac
Merge remote-tracking branch 'origin/chore/transformers_5p0' into cho…
yaoyu-33 Feb 11, 2026
77e5633
fix deepseek and gemma3
yaoyu-33 Feb 12, 2026
04060d9
glm fix
yaoyu-33 Feb 12, 2026
1ca98f9
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 18, 2026
34c55cc
update uv.lock
yaoyu-33 Feb 18, 2026
a9e73a5
fix nemotronh and qwen3moe
yaoyu-33 Feb 19, 2026
96c7291
fix vlm models
yaoyu-33 Feb 20, 2026
fcf858e
Merge branch 'chore/transformers_5p0' of github.com:NVIDIA-NeMo/Megat…
yaoyu-33 Feb 20, 2026
437a2ad
fix: handle transformers 5.0 rope_theta migration to rope_parameters
yaoyu-33 Feb 20, 2026
2414d37
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 20, 2026
5a99c0e
revert unwanted change
yaoyu-33 Feb 20, 2026
b5f6253
refactor: use rope compat functions as direct imports instead of stat…
yaoyu-33 Feb 20, 2026
2e6bbe8
fix: update tests for transformers 5.0 compatibility
yaoyu-33 Feb 20, 2026
5b9cea4
Merge remote-tracking branch 'origin/main' into chore/transformers_5p0
yaoyu-33 Feb 20, 2026
05852e1
fix: handle nested text_config in Qwen2.5 VL conversion test for tran…
yaoyu-33 Feb 23, 2026
ae12a45
fix: handle rope_parameters for transformers 5.0+ compatibility
yaoyu-33 Feb 23, 2026
432173f
fix: handle both pre-5.0 and 5.0+ HF expert weight layouts in Qwen3 V…
yaoyu-33 Feb 23, 2026
5ec9171
fix: update nemotron conversion tests for transformers 5.0 compatibility
yaoyu-33 Feb 23, 2026
494b4b1
Merge remote-tracking branch 'origin/main' into chore/transformers_5p0
yaoyu-33 Feb 23, 2026
c3c8be7
update uv.lock
Feb 23, 2026
d69cd23
Merge branch 'main' into chore/transformers_5p0
yaoyu-33 Feb 25, 2026
5388a3c
update uv lock
yaoyu-33 Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion examples/conversion/hf_megatron_roundtrip_multi_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@
HF_MODEL_ID = "meta-llama/Llama-3.2-1B"
console = Console()

# Parameters where Megatron and HF may use different dtypes.
# These are compared in float32 to avoid false mismatches.
IGNORE_PRECISION_PARAMS = [
"e_score_correction_bias",
]


@torchrun_main
def main(
Expand Down Expand Up @@ -155,8 +161,15 @@ def main(
for name, param in bridge.export_hf_weights(megatron_model, show_progress=False):
if is_rank_0:
original_param = bridge.hf_pretrained.state[name]
compare_param = param
compare_original = original_param
# Cast to float32 for params with known dtype mismatches between Megatron and HF
# (e.g. Megatron keeps expert_bias in float32 while HF may use bfloat16)
if any(p in name for p in IGNORE_PRECISION_PARAMS):
compare_param = param.float()
compare_original = original_param.float()
match = torch.allclose(
param, original_param.to(param.device), atol=1e-1
compare_param, compare_original.to(compare_param.device), atol=1e-1
) # Increased tolerance for bfloat16
all_match = all_match and match
table.add_row(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ classifiers = [
"Topic :: Utilities",
]
dependencies = [
"transformers<5.0.0",
"transformers>=5.0.0",
"datasets>=2.20.0",
"accelerate",
"omegaconf>=2.3.0",
Expand Down
25 changes: 18 additions & 7 deletions src/megatron/bridge/models/conversion/model_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
MegatronParamMapping,
)
from megatron.bridge.models.conversion.peft_bridge import AdapterWeightConversionTask, MegatronPeftBridge
from megatron.bridge.models.conversion.transformers_compat import (
rope_theta_from_hf,
)
from megatron.bridge.models.conversion.utils import (
extract_sort_key,
get_module_and_param_from_name,
Expand Down Expand Up @@ -364,6 +367,14 @@ def hf_config_to_provider_kwargs(self, hf_config) -> dict:
if value is not None:
provider_kwargs[megatron_name] = value

# Extract rotary_base via compat function (handles both legacy rope_theta
# attribute and transformers 5.0+ rope_parameters dict)
if "rotary_base" not in provider_kwargs:
try:
provider_kwargs["rotary_base"] = rope_theta_from_hf(hf_config)
except ValueError:
pass

# Handle rope scaling: extract params from rope_scaling dict
# HF configs use either "type" or "rope_type" key for the scaling type
from megatron.bridge.models.mla_provider import MLAModelProvider
Expand Down Expand Up @@ -945,7 +956,7 @@ def stream_weights_megatron_to_hf(
megatron_to_hf_tasks = conversion_tasks
unwrapped_model = unwrap_model(megatron_model)[0]
model_config = unwrapped_model.config
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config)

hf_state_dict: Mapping[str, torch.Tensor] = hf_pretrained.state if hasattr(hf_pretrained, "state") else {}

Expand Down Expand Up @@ -1107,11 +1118,11 @@ def _get_provider_from_model(self, model: MegatronModule) -> ModelProviderTarget
return model.config

def _share_embeddings_and_output_weights(
self, model_config: TransformerConfig, model: Optional[MegatronModule]
self,
model_config: TransformerConfig,
) -> bool:
"""Fallback-aware accessor for shared embedding setting."""
fallback = getattr(model, "share_embeddings_and_output_weights", False) if model else False
return getattr(model_config, "share_embeddings_and_output_weights", fallback)
"""Shared embedding setting."""
return getattr(model_config, "share_embeddings_and_output_weights")

def _unwrap_name(self, name: str) -> str:
"""Unwrap name from DDP or other wrappers.
Expand Down Expand Up @@ -1147,7 +1158,7 @@ def _broadcast_shared_embeddings(self, megatron_model: Union[MegatronModel, List
if hasattr(unwrapped_model, "language_model") and unwrapped_model.language_model is not None:
unwrapped_model = unwrapped_model.language_model
model_config = unwrapped_model.config
share_embeddings = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
share_embeddings = self._share_embeddings_and_output_weights(model_config)

# TODO(yuya): Fix for VPP, the vp stage needs to be passed in for stage checks
if (share_embeddings and model_config.pipeline_model_parallel_size > 1) and (
Expand Down Expand Up @@ -1190,7 +1201,7 @@ def build_conversion_tasks(
mapping_registry = self.mapping_registry()
unwrapped_model = unwrap_model(megatron_model)[0]
model_config = unwrapped_model.config
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config, unwrapped_model)
embeddings_are_tied = self._share_embeddings_and_output_weights(model_config)
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
sorted_global_param_names_all_pp_ranks = self._megatron_global_param_names_all_pp_ranks(megatron_model)

Expand Down
184 changes: 184 additions & 0 deletions src/megatron/bridge/models/conversion/transformers_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compatibility utilities for HuggingFace transformers 5.0+ configs."""


def rope_theta_from_hf(config) -> float:
    """Extract rope_theta from a HuggingFace config.

    Supports both the legacy layout (a direct ``rope_theta`` attribute,
    transformers <5.0) and the transformers 5.0+ layouts, where the value may
    live in the ``rope_parameters`` dict — flat, or nested under ``"global"``
    / ``"full_attention"`` for models like Gemma3 — or in ``default_theta``.

    Args:
        config: HuggingFace configuration object.

    Returns:
        float: The rope_theta value for rotary embeddings.

    Raises:
        ValueError: If rope_theta is not found in any supported location.
    """

    def _candidates():
        # Legacy direct attribute (transformers <5.0).
        yield getattr(config, "rope_theta", None)

        params = getattr(config, "rope_parameters", None)
        if params:
            # Flat layout: rope_parameters["rope_theta"].
            yield params.get("rope_theta")
            # Gemma3-style nesting: rope_parameters["global"]["base"].
            global_section = params.get("global")
            if isinstance(global_section, dict):
                yield global_section.get("base")
            # Gemma3 (transformers 5.0+): rope_parameters["full_attention"]["rope_theta"].
            full_attention = params.get("full_attention")
            if isinstance(full_attention, dict):
                yield full_attention.get("rope_theta")

        default_theta = getattr(config, "default_theta", None)
        if default_theta:
            # default_theta may be a plain number (e.g. NemotronH) ...
            if isinstance(default_theta, (int, float)):
                yield float(default_theta)
            # ... or a dict keyed by layer kind (e.g. Gemma3).
            elif isinstance(default_theta, dict):
                yield default_theta.get("global")

    # First non-None candidate wins, preserving the lookup priority above.
    for value in _candidates():
        if value is not None:
            return value

    raise ValueError(
        "rope_theta not found in config. Expected either 'rope_theta' attribute "
        "(transformers <5.0), 'rope_parameters[\"rope_theta\"]', "
        '\'rope_parameters["global"]["base"]\', \'rope_parameters["full_attention"]["rope_theta"]\', '
        "or 'default_theta[\"global\"]' (transformers >=5.0)."
    )


def rope_local_base_freq_from_hf(config) -> float:
    """Extract rope_local_base_freq from a HuggingFace config.

    Counterpart of :func:`rope_theta_from_hf` for the local base frequency
    used by models with interleaved local/global attention (e.g., Gemma3).
    Handles the legacy direct attribute (transformers <5.0) as well as the
    transformers 5.0+ ``rope_parameters`` layouts, the ``rope_scaling``
    fallback, and ``default_theta["local"]``.

    Args:
        config: HuggingFace configuration object.

    Returns:
        float: The rope_local_base_freq value.

    Raises:
        ValueError: If rope_local_base_freq is not found in any supported location.
    """

    def _candidates():
        # Legacy direct attribute (transformers <5.0).
        yield getattr(config, "rope_local_base_freq", None)

        params = getattr(config, "rope_parameters", None)
        if params:
            # Flat layout: rope_parameters["rope_local_base_freq"].
            yield params.get("rope_local_base_freq")
            # Gemma3-style nesting: rope_parameters["local"]["base"].
            local_section = params.get("local")
            if isinstance(local_section, dict):
                yield local_section.get("base")
            # Gemma3 (transformers 5.0+): rope_parameters["sliding_attention"]["rope_theta"].
            sliding_section = params.get("sliding_attention")
            if isinstance(sliding_section, dict):
                yield sliding_section.get("rope_theta")

        # rope_scaling carries the value for some configs.
        scaling = getattr(config, "rope_scaling", None)
        if scaling:
            yield scaling.get("rope_local_base_freq")

        # default_theta dict (transformers 5.0+), keyed by layer kind.
        default_theta = getattr(config, "default_theta", None)
        if default_theta and isinstance(default_theta, dict):
            yield default_theta.get("local")

    # First non-None candidate wins, preserving the lookup priority above.
    for value in _candidates():
        if value is not None:
            return value

    raise ValueError(
        "rope_local_base_freq not found in config. Expected either 'rope_local_base_freq' attribute "
        "(transformers <5.0), 'rope_parameters[\"rope_local_base_freq\"]', "
        '\'rope_parameters["local"]["base"]\', \'rope_parameters["sliding_attention"]["rope_theta"]\', '
        "'rope_scaling[\"rope_local_base_freq\"]', or 'default_theta[\"local\"]' (transformers >=5.0)."
    )


def rope_scaling_factor_from_hf(config, default: float = 1.0) -> float:
    """Extract the rope scaling factor from a HuggingFace config.

    Looks in the legacy ``rope_scaling`` dict first (transformers <5.0 and
    some 5.0+ models), then in ``rope_parameters`` (transformers >=5.0),
    checking the per-layer-type sections ("full_attention", "global") before
    the flat layout. Unlike the other compat helpers, a missing value is not
    an error — ``default`` is returned instead.

    Args:
        config: HuggingFace configuration object.
        default: Value returned when no scaling factor is present.

    Returns:
        float: The rope scaling factor value, or ``default`` if not found.
    """
    # Legacy rope_scaling dict (transformers <5.0 and some 5.0+ models).
    scaling = getattr(config, "rope_scaling", None)
    if isinstance(scaling, dict):
        factor = scaling.get("factor")
        if factor is not None:
            return factor

    params = getattr(config, "rope_parameters", None)
    if params:
        # Per-layer-type sections first (Gemma3-style nesting), then the
        # flat layout (params itself holds the "factor" key).
        for section in (params.get("full_attention"), params.get("global"), params):
            if isinstance(section, dict):
                factor = section.get("factor")
                if factor is not None:
                    return factor

    return default
60 changes: 0 additions & 60 deletions src/megatron/bridge/models/deepseek/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,66 +13,6 @@
# limitations under the License.

from megatron.bridge.models.conversion.param_mapping import AutoMapping, GatedMLPMapping
from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM


try:
import apex # noqa: F401

HAVE_APEX = True
except ImportError:
HAVE_APEX = False


def get_common_configs(hf_pretrained: PreTrainedCausalLM) -> dict:
"""
Returns a dictionary of common configurations for the DeepSeek family of models.
"""
hf_config = hf_pretrained.config

configs = {}

if not HAVE_APEX:
configs["gradient_accumulation_fusion"] = False

if hasattr(hf_config, "rope_scaling") and hf_config.rope_scaling is not None:
configs["rotary_scaling_factor"] = hf_config.rope_scaling["factor"]
configs["mscale"] = hf_config.rope_scaling["mscale"]
configs["mscale_all_dim"] = hf_config.rope_scaling["mscale_all_dim"]
else:
configs["rotary_scaling_factor"] = 1.0
configs["mscale"] = 1.0
configs["mscale_all_dim"] = 1.0

configs["num_layers"] = hf_config.num_hidden_layers
configs["hidden_size"] = hf_config.hidden_size
configs["ffn_hidden_size"] = hf_config.intermediate_size
configs["num_attention_heads"] = hf_config.num_attention_heads
configs["num_query_groups"] = hf_config.num_key_value_heads
configs["q_lora_rank"] = hf_config.q_lora_rank
configs["num_moe_experts"] = hf_config.n_routed_experts
configs["moe_ffn_hidden_size"] = hf_config.moe_intermediate_size
configs["moe_shared_expert_intermediate_size"] = hf_config.moe_intermediate_size * hf_config.n_shared_experts
configs["moe_layer_freq"] = [0] * hf_config.first_k_dense_replace + [1] * (
hf_config.num_hidden_layers - hf_config.first_k_dense_replace
)
configs["moe_router_topk"] = hf_config.num_experts_per_tok
configs["moe_router_num_groups"] = hf_config.n_group
configs["moe_router_group_topk"] = hf_config.topk_group
configs["moe_router_topk_scaling_factor"] = hf_config.routed_scaling_factor
configs["kv_lora_rank"] = hf_config.kv_lora_rank
configs["qk_head_dim"] = hf_config.qk_nope_head_dim
configs["qk_pos_emb_head_dim"] = hf_config.qk_rope_head_dim
configs["v_head_dim"] = hf_config.v_head_dim

# Ensure MLA is enabled
configs["multi_latent_attention"] = True
configs["vocab_size"] = hf_config.vocab_size
configs["rotary_base"] = hf_config.rope_theta
configs["init_method_std"] = hf_config.initializer_range
configs["layernorm_epsilon"] = hf_config.rms_norm_eps

return configs


def get_common_mapping_list() -> list:
Expand Down
4 changes: 3 additions & 1 deletion src/megatron/bridge/models/deepseek/deepseek_v3_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge, WeightConversionTask
from megatron.bridge.models.conversion.param_mapping import AutoMapping
from megatron.bridge.models.conversion.transformers_compat import rope_theta_from_hf
from megatron.bridge.models.deepseek.common import get_common_mapping_list
from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM
from megatron.bridge.models.mla_provider import MLAModelProvider
Expand Down Expand Up @@ -62,6 +63,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
provider.moe_token_dispatcher_type = "alltoall"
provider.moe_router_load_balancing_type = "seq_aux_loss"
provider.moe_shared_expert_overlap = True
provider.moe_router_score_function = "sigmoid"
provider.moe_router_enable_expert_bias = True
provider.moe_router_dtype = "fp32"
provider.moe_permute_fusion = True
Expand Down Expand Up @@ -138,7 +140,7 @@ def maybe_modify_converted_hf_weight(
inv_freq = getattr(self, "_deepseek_inv_freq", None)
if inv_freq is None:
rotary_dim = self.hf_config.qk_rope_head_dim
rotary_base = self.hf_config.rope_theta
rotary_base = rope_theta_from_hf(self.hf_config)
inv_freq = 1.0 / (rotary_base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
self._deepseek_inv_freq = inv_freq

Expand Down
Loading
Loading