NVIDIA-NeMo · pengdurice · Feb 20, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py
@@ -64,6 +64,10 @@
     GLM45ModelProvider355B,
     GLMMoEModelProvider,
 )
+from megatron.bridge.models.glm_moe_dsa import (
+    GLM5Bridge,
+    GLM5ModelProvider,
+)
 from megatron.bridge.models.glm_vl import (
     GLM45VBridge,
     GLM45VModelProvider,
@@ -222,6 +226,8 @@
     "GLM45ModelProvider355B",
     "GLM45AirModelProvider106B",
     "GLM45Bridge",
+    "GLM5Bridge",
+    "GLM5ModelProvider",
     "GLM45VBridge",
     "GLM45VModelProvider",
     "GPTModelProvider",

diff --git a/src/megatron/bridge/models/glm_moe_dsa/__init__.py b/src/megatron/bridge/models/glm_moe_dsa/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from megatron.bridge.models.glm_moe_dsa.glm5_bridge import GLM5Bridge
+from megatron.bridge.models.glm_moe_dsa.glm5_provider import (
+    GLM5ModelProvider,
+)
+
+
+__all__ = [
+    "GLM5ModelProvider",
+    "GLM5Bridge",
+]
diff --git a/src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py b/src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import re
+from typing import Dict, Mapping, Optional, Tuple
+
+import torch
+from megatron.core.models.gpt.gpt_model import GPTModel
+from transformers import GlmMoeDsaForCausalLM
+
+from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
+from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge
+from megatron.bridge.models.conversion.param_mapping import (
+    AutoMapping,
+    GatedMLPMapping,
+    QKVMapping,
+)
+from megatron.bridge.models.glm_moe_dsa.glm5_provider import GLM5ModelProvider
+from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM
+
+
+logger = logging.getLogger(__name__)
+
+
+@MegatronModelBridge.register_bridge(source=GlmMoeDsaForCausalLM, target=GPTModel, model_type="glm_moe_dsa")
+class GLM5Bridge(MegatronModelBridge):
+    """
+    Megatron Bridge for GLM 5 Models.
+
+    This bridge handles the conversion between HuggingFace Glm5MoeForCausalLM
+    (used for GLM 5 models) and Megatron-Core GPTModel formats.
+
+    Example:
+        >>> from megatron.bridge import AutoBridge
+        >>> bridge = AutoBridge.from_hf_pretrained("zai-org/GLM-4.5")
+        >>> provider = bridge.to_megatron_provider()
+    """
+
+    @staticmethod
+    def _get_glm5_configs(hf_pretrained: PreTrainedCausalLM) -> dict:
+        """Build provider kwargs from GLM5 HF config schema."""
+        hf_config = hf_pretrained.config
+
+        configs = {
+            "num_layers": hf_config.num_hidden_layers,
+            "hidden_size": hf_config.hidden_size,
+            "ffn_hidden_size": hf_config.intermediate_size,
+            "num_attention_heads": hf_config.num_attention_heads,
+            "num_query_groups": hf_config.num_key_value_heads,
+            "kv_channels": getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads),
+            "q_lora_rank": hf_config.q_lora_rank,
+            "kv_lora_rank": hf_config.kv_lora_rank,
+            "num_moe_experts": hf_config.n_routed_experts,
+            "moe_ffn_hidden_size": hf_config.moe_intermediate_size,
+            "moe_shared_expert_intermediate_size": hf_config.moe_intermediate_size * hf_config.n_shared_experts,
+            "moe_layer_freq": [0] * hf_config.first_k_dense_replace
+            + [1] * (hf_config.num_hidden_layers - hf_config.first_k_dense_replace),
+            "moe_router_topk": hf_config.num_experts_per_tok,
+            "moe_router_num_groups": hf_config.n_group,
+            "moe_router_group_topk": hf_config.topk_group,
+            "moe_router_topk_scaling_factor": hf_config.routed_scaling_factor,
+            # MLA dims in MCore format
+            "qk_head_dim": hf_config.qk_nope_head_dim,
+            "qk_pos_emb_head_dim": hf_config.qk_rope_head_dim,
+            "v_head_dim": hf_config.v_head_dim,
+            "vocab_size": hf_config.vocab_size,
+            "rotary_base": hf_config.rope_parameters["rope_theta"],
+            "init_method_std": hf_config.initializer_range,
+            "layernorm_epsilon": hf_config.rms_norm_eps,
+            "multi_latent_attention": True,
+            # DSA indexer params (v3.2-compatible interface)
+            "experimental_attention_variant": "dsa",
+            "dsa_indexer_head_dim": hf_config.index_head_dim,
+            "dsa_indexer_n_heads": hf_config.index_n_heads,
+            "dsa_indexer_topk": hf_config.index_topk,
+            "dsa_indexer_loss_coeff": 0.001,
+            "dsa_indexer_use_sparse_loss": True,
+            # MTP params
+            "mtp_loss_scaling_factor": 0.1,
+            # GLM5 uses default rope parameters (not yarn rope_scaling)
+            "rotary_scaling_factor": 1.0,
+            "mscale": 1.0,
+            "mscale_all_dim": 1.0,
+            "add_bias_linear": False,
+            "position_embedding_type": "rope",
+            "normalization": "RMSNorm",
+        }
+
+        return configs
+
+    def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GLM5ModelProvider:
+        hf_config = hf_pretrained.config
+        configs = self._get_glm5_configs(hf_pretrained)
+
+        configs["fp16"] = self.dtype_from_hf(hf_config, default=torch.float32) == torch.float16
+        configs["bf16"] = self.dtype_from_hf(hf_config, default=torch.float32) == torch.bfloat16
+        configs["params_dtype"] = self.dtype_from_hf(hf_config, default=torch.float32)
+
+        configs["make_vocab_size_divisible_by"] = 1280
+        configs["moe_router_score_function"] = "sigmoid"
+        configs["moe_router_enable_expert_bias"] = False
+        if hasattr(hf_config, "aux_loss_alpha"):
+            configs["moe_aux_loss_coeff"] = hf_config.aux_loss_alpha
+
+        provider = GLM5ModelProvider(**configs)
+        # Use experimental-attention spec for DSA
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_transformer_block_with_experimental_attention_variant_spec,
+        )
+        provider.transformer_layer_spec = (
+            get_transformer_block_with_experimental_attention_variant_spec
+        )
+        provider.normalization = "RMSNorm"
+        provider.gated_linear_unit = True
+        provider.position_embedding_type = "rope"
+        provider.add_bias_linear = False
+        provider.share_embeddings_and_output_weights = False
+        provider.qk_layernorm = True
+        provider.multi_latent_attention = True
+        provider.moe_grouped_gemm = True
+        provider.moe_router_pre_softmax = True
+        provider.moe_token_dispatcher_type = "alltoall"
+        provider.moe_router_load_balancing_type = "seq_aux_loss"
+        provider.moe_shared_expert_overlap = True
+        provider.moe_router_dtype = "fp32"
+        provider.moe_permute_fusion = True
+        provider.hidden_dropout = 0.0
+        provider.attention_softmax_in_fp32 = False
+        provider.make_vocab_size_divisible_by = 1280
+        return provider
+
+    def mapping_registry(self) -> MegatronMappingRegistry:
+        mapping_list = []
+
+        param_mappings = {
+            # Embed
+            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
+            # Attention
+            "decoder.layers.*.input_layernorm.weight": "model.layers.*.input_layernorm.weight",
+            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
+            # Reference: https://github.com/NVIDIA/NeMo/blob/50cceb9c90ea1f440d1e14074fa13bd45f60a1c4/nemo/collections/llm/gpt/model/deepseek.py#L637-L650
+            #  In deepseek, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to the following mcore weights depending on the layer type:
+            #  (a) `decoder.layers.*.pre_mlp_layernorm.weight`, if the layer is MoE
+            #  (b) `decoder.layers.*.mlp.linear_fc1.layer_norm_weight`, if the layer is dense
+            "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight",
+            "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
+            "decoder.layers.*.self_attention.linear_kv_down_proj.weight": "model.layers.*.self_attn.kv_a_proj_with_mqa.weight",
+            "decoder.layers.*.self_attention.linear_kv_up_proj.weight": "model.layers.*.self_attn.kv_b_proj.weight",
+            "decoder.layers.*.self_attention.linear_kv_up_proj.layer_norm_weight": "model.layers.*.self_attn.kv_a_layernorm.weight",
+            # Mcore local spec
+            "decoder.layers.*.self_attention.kv_layernorm.weight": "model.layers.*.self_attn.kv_a_layernorm.weight",
+            # Dense MLP
+            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
+            # MoE
+            "decoder.layers.*.mlp.router.weight": "model.layers.*.mlp.gate.weight",
+            "decoder.layers.*.mlp.experts.linear_fc2.weight*": "model.layers.*.mlp.experts.*.down_proj.weight",
+            "decoder.layers.*.mlp.shared_experts.linear_fc2.weight": "model.layers.*.mlp.shared_experts.down_proj.weight",
+            # LM Head
+            "decoder.final_layernorm.weight": "model.norm.weight",
+            "output_layer.weight": "lm_head.weight",
+            # MLA
+            "decoder.layers.*.self_attention.linear_q_down_proj.weight": "model.layers.*.self_attn.q_a_proj.weight",
+            "decoder.layers.*.self_attention.linear_q_up_proj.weight": "model.layers.*.self_attn.q_b_proj.weight",
+            "decoder.layers.*.self_attention.linear_q_up_proj.layer_norm_weight": "model.layers.*.self_attn.q_a_layernorm.weight",
+            # Mcore local spec
+            "decoder.layers.*.self_attention.q_layernorm.weight": "model.layers.*.self_attn.q_a_layernorm.weight",
+            # For models without MLA
+            "decoder.layers.*.self_attention.linear_q_proj.weight": "model.layers.*.self_attn.q_proj.weight",
+            # Sparse attention indexer
+            "decoder.layers.*.self_attention.core_attention.indexer.linear_wq_b.weight": "model.layers.*.self_attn.indexer.wq_b.weight",
+            "decoder.layers.*.self_attention.core_attention.indexer.linear_wk.weight": "model.layers.*.self_attn.indexer.wk.weight",
+            "decoder.layers.*.self_attention.core_attention.indexer.k_norm.weight": "model.layers.*.self_attn.indexer.k_norm.weight",
+            "decoder.layers.*.self_attention.core_attention.indexer.k_norm.bias": "model.layers.*.self_attn.indexer.k_norm.bias",
+            "decoder.layers.*.self_attention.core_attention.indexer.linear_weights_proj.weight": "model.layers.*.self_attn.indexer.weights_proj.weight",
+        }
+        layer_specific_mappings = {
+            "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
+            "decoder.layers.*.mlp.shared_experts.router.weight": "model.layers.*.mlp.shared_experts.gate.weight",
+            "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias",
+        }
+
+        for megatron_param, hf_param in param_mappings.items():
+            mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
+
+        for megatron_param, hf_param in layer_specific_mappings.items():
+            mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
+
+        # Add special mappings that require parameter concatenation/transformation
+        mapping_list.extend(
+            [
+                # QKV: Combine separate Q, K, V matrices into single QKV matrix
+                QKVMapping(
+                    megatron_param="decoder.layers.*.self_attention.linear_qkv.weight",
+                    q="model.layers.*.self_attn.q_proj.weight",
+                    k="model.layers.*.self_attn.k_proj.weight",
+                    v="model.layers.*.self_attn.v_proj.weight",
+                ),
+                QKVMapping(
+                    megatron_param="decoder.layers.*.self_attention.linear_qkv.bias",
+                    q="model.layers.*.self_attn.q_proj.bias",
+                    k="model.layers.*.self_attn.k_proj.bias",
+                    v="model.layers.*.self_attn.v_proj.bias",
+                ),
+                # Gated MLP: Combine gate and up projection matrices into single FC1 matrix
+                GatedMLPMapping(
+                    megatron_param="decoder.layers.*.mlp.linear_fc1.weight",
+                    gate="model.layers.*.mlp.gate_proj.weight",
+                    up="model.layers.*.mlp.up_proj.weight",
+                ),
+                GatedMLPMapping(
+                    megatron_param="decoder.layers.*.mlp.shared_experts.linear_fc1.weight",
+                    gate="model.layers.*.mlp.shared_experts.gate_proj.weight",
+                    up="model.layers.*.mlp.shared_experts.up_proj.weight",
+                ),
+                GatedMLPMapping(
+                    megatron_param="decoder.layers.*.mlp.experts.linear_fc1.weight*",
+                    gate="model.layers.*.mlp.experts.*.gate_proj.weight",
+                    up="model.layers.*.mlp.experts.*.up_proj.weight",
+                ),
+            ]
+        )
+        hf_config = self.hf_config
+        num_mtp_layers = getattr(hf_config, "num_nextn_predict_layers", 0)
+        num_transformer_layers = hf_config.num_hidden_layers
+        for mtp_layer in range(num_mtp_layers):
+            # MTP specific mappings
+            mapping_list.extend(
+                [
+                    AutoMapping(
+                        megatron_param=f"mtp.layers.{mtp_layer}.enorm.weight",
+                        hf_param=f"model.layers.{mtp_layer + num_transformer_layers}.enorm.weight",
+                    ),
+                    AutoMapping(
+                        megatron_param=f"mtp.layers.{mtp_layer}.hnorm.weight",
+                        hf_param=f"model.layers.{mtp_layer + num_transformer_layers}.hnorm.weight",
+                    ),
+                    AutoMapping(
+                        megatron_param=f"mtp.layers.{mtp_layer}.eh_proj.weight",
+                        hf_param=f"model.layers.{mtp_layer + num_transformer_layers}.eh_proj.weight",
+                    ),
+                    AutoMapping(
+                        megatron_param=f"mtp.layers.{mtp_layer}.final_layernorm.weight",
+                        hf_param=f"model.layers.{mtp_layer + num_transformer_layers}.shared_head.norm.weight",
+                    ),
+                ]
+            )
+
+            for layer_prefix in ("transformer_layer", "mtp_model_layer"):
+                for megatron_param, hf_param in (param_mappings | layer_specific_mappings).items():
+                    megatron_param = (
+                        megatron_param.replace(".*", f".*.{layer_prefix}")
+                        .replace("decoder", "mtp")
+                        .replace(".*", f".{mtp_layer}")
+                    )
+                    hf_param = hf_param.replace("layers.*", f"layers.{mtp_layer + num_transformer_layers}")
+                    mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
+                # Special mappings that require parameter concatenation/transformation
+                mapping_list.extend(
+                    [
+                        QKVMapping(
+                            megatron_param=f"mtp.layers.{mtp_layer}.{layer_prefix}.self_attention.linear_qkv.weight",
+                            q=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.q_proj.weight",
+                            k=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.k_proj.weight",
+                            v=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.v_proj.weight",
+                        ),
+                        QKVMapping(
+                            megatron_param=f"mtp.layers.{mtp_layer}.{layer_prefix}.self_attention.linear_qkv.bias",
+                            q=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.q_proj.bias",
+                            k=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.k_proj.bias",
+                            v=f"model.layers.{mtp_layer + num_transformer_layers}.self_attn.v_proj.bias",
+                        ),
+                        GatedMLPMapping(
+                            megatron_param=f"mtp.layers.{mtp_layer}.{layer_prefix}.mlp.linear_fc1.weight",
+                            gate=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.gate_proj.weight",
+                            up=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.up_proj.weight",
+                        ),
+                        GatedMLPMapping(
+                            megatron_param=f"mtp.layers.{mtp_layer}.{layer_prefix}.mlp.shared_experts.linear_fc1.weight",
+                            gate=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.shared_experts.gate_proj.weight",
+                            up=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.shared_experts.up_proj.weight",
+                        ),
+                        GatedMLPMapping(
+                            megatron_param=f"mtp.layers.{mtp_layer}.{layer_prefix}.mlp.experts.linear_fc1.weight*",
+                            gate=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.experts.*.gate_proj.weight",
+                            up=f"model.layers.{mtp_layer + num_transformer_layers}.mlp.experts.*.up_proj.weight",
+                        ),
+                    ]
+                )
+
+        return MegatronMappingRegistry(*mapping_list)
+
diff --git a/src/megatron/bridge/models/glm_moe_dsa/glm5_provider.py b/src/megatron/bridge/models/glm_moe_dsa/glm5_provider.py
diff --git a/tests/functional_tests/models/glm_moe_dsa/__init__.py b/tests/functional_tests/models/glm_moe_dsa/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.