src/megatron/bridge/models/deepseek/deepseek_v2_bridge.py (1 change: 0 additions & 1 deletion)
@@ -69,7 +69,6 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
         provider.cross_entropy_loss_fusion = True
         provider.masked_softmax_fusion = True
         provider.persist_layer_norm = True
-        provider.async_tensor_model_parallel_allreduce = True
         provider.gradient_accumulation_fusion = True
 
         provider.hidden_dropout = 0.0
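The same one-line deletion repeats across the bridges and providers below: `async_tensor_model_parallel_allreduce` is dropped everywhere it was set, presumably because current Megatron-Core no longer defines the field on its config objects. A minimal, hypothetical compatibility shim (not part of this PR; names are illustrative) for code that must still run against both old and new Megatron-Core versions:

def set_if_supported(cfg: object, name: str, value: object) -> None:
    # Assign a config flag only when the installed Megatron-Core
    # version still defines it; otherwise skip it silently.
    if hasattr(cfg, name):
        setattr(cfg, name, value)

# Usage sketch: a no-op on versions that removed the field.
# set_if_supported(provider, "async_tensor_model_parallel_allreduce", True)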
src/megatron/bridge/models/deepseek/deepseek_v3_bridge.py (1 change: 0 additions & 1 deletion)
@@ -74,7 +74,6 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
         provider.cross_entropy_loss_fusion = True
         provider.masked_softmax_fusion = True
         provider.persist_layer_norm = True
-        provider.async_tensor_model_parallel_allreduce = True
         provider.gradient_accumulation_fusion = True
 
         provider.hidden_dropout = 0.0
src/megatron/bridge/models/kimi/kimi_provider.py (1 change: 0 additions & 1 deletion)
@@ -109,7 +109,6 @@ class KimiK2Provider(MLATransformerConfig, GPTModelProvider):
     layernorm_epsilon: float = 1e-6
     bf16: bool = True
     params_dtype: torch.dtype = torch.bfloat16
-    async_tensor_model_parallel_allreduce: bool = True
     attention_softmax_in_fp32: bool = False
     persist_layer_norm: bool = True
     num_layers_in_first_pipeline_stage: Optional[int] = None
src/megatron/bridge/models/model_provider.py (1 change: 0 additions & 1 deletion)
@@ -466,7 +466,6 @@ class ModelParallelKwargs(TypedDict, total=False):
     context_parallel_size: int
     expert_model_parallel_size: int
     expert_tensor_parallel_size: int
-    moe_extended_tp: bool
     sequence_parallel: bool
     virtual_pipeline_model_parallel_size: int | None
     hierarchical_context_parallel_sizes: list[int] | None
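`moe_extended_tp` likewise leaves the override keys, with the surviving `expert_tensor_parallel_size` key presumably covering expert-parallel layouts upstream. Because the TypedDict is declared with `total=False`, every key is optional and callers supply only the overrides they need — a self-contained sketch of that pattern (the field subset and values are chosen for illustration):

from typing import TypedDict

class _ParallelKwargs(TypedDict, total=False):
    # total=False makes every key optional rather than required.
    context_parallel_size: int
    expert_tensor_parallel_size: int
    sequence_parallel: bool

# Unspecified keys are simply absent from the dict, not set to None.
overrides: _ParallelKwargs = {"expert_tensor_parallel_size": 2}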
src/megatron/bridge/models/qwen_vl/qwen3_vl_provider.py (1 change: 0 additions & 1 deletion)
@@ -244,7 +244,6 @@ class Qwen3VLMoEModelProvider(GPTModelProvider):
     bias_dropout_fusion: bool = True
     masked_softmax_fusion: bool = False  # Don't fuse masked softmax (Qwen specific)
     deallocate_pipeline_outputs: bool = True
-    async_tensor_model_parallel_allreduce: bool = True
     distribute_saved_activations: bool = False
     cp_comm_type: str = "p2p"
 
src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py (2 changes: 0 additions & 2 deletions)
@@ -160,7 +160,6 @@ def nemotron_3_nano_pretrain_config() -> ConfigContainer:
 
     cfg.model.init_method_std = 0.0173
    cfg.model.apply_rope_fusion = False
-    cfg.model.async_tensor_model_parallel_allreduce = True
     cfg.model.gradient_accumulation_fusion = True
     cfg.model.use_fused_weighted_squared_relu = True
 
@@ -331,7 +330,6 @@ def _nemotron_3_nano_finetune_common(
         expert_tensor_parallel_size=expert_tensor_parallelism,
         expert_model_parallel_size=expert_model_parallelism,
         apply_rope_fusion=False,
-        async_tensor_model_parallel_allreduce=True,
         attention_backend="fused",
         gradient_accumulation_fusion=True,
         init_method_std=0.0173,
src/megatron/bridge/training/model_load_save.py (1 change: 0 additions & 1 deletion)
@@ -382,7 +382,6 @@ def load_megatron_model(
     model_cfg.context_parallel_size = 1
     model_cfg.expert_model_parallel_size = 1
     model_cfg.expert_tensor_parallel_size = 1
-    model_cfg.moe_extended_tp = False
     model_cfg.sequence_parallel = False
     model_cfg.perform_initialization = False
     model_cfg.virtual_pipeline_model_parallel_size = None
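For context, `load_megatron_model` flattens every parallelism setting to its single-process default before loading a checkpoint, so the removed field is simply one fewer thing to reset. A sketch of that reset pattern, using only the field names visible in the hunk above (the wrapper function itself is illustrative, not the PR's code):

def _reset_to_single_process(model_cfg) -> None:
    # Collapse all model-parallel dimensions so the checkpoint can be
    # materialized without a distributed launch.
    model_cfg.context_parallel_size = 1
    model_cfg.expert_model_parallel_size = 1
    model_cfg.expert_tensor_parallel_size = 1
    model_cfg.sequence_parallel = False
    model_cfg.perform_initialization = False
    model_cfg.virtual_pipeline_model_parallel_size = None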
tests/unit_tests/peft/test_utils.py (1 change: 0 additions & 1 deletion)
@@ -62,7 +62,6 @@ def __init__(self):
         self.perform_initialization = True
         self.use_cpu_initialization = False
         self.gradient_accumulation_fusion = False
-        self.async_tensor_model_parallel_allreduce = False
 
 
 class MockColumnParallelLinear(ColumnParallelLinear):
tests/unit_tests/training/test_model_load_save.py (3 changes: 0 additions & 3 deletions)
@@ -525,7 +525,6 @@ def test_load_megatron_model_resets_defaults(self, mock_load_model_config, mock_
         cfg.context_parallel_size = 2
         cfg.expert_model_parallel_size = 2
         cfg.expert_tensor_parallel_size = 2
-        cfg.moe_extended_tp = True
         cfg.sequence_parallel = True
         cfg.virtual_pipeline_model_parallel_size = 2
         cfg.hierarchical_context_parallel_sizes = [2, 2]
@@ -545,7 +544,6 @@ def test_load_megatron_model_resets_defaults(self, mock_load_model_config, mock_
         assert cfg.context_parallel_size == 1
         assert cfg.expert_model_parallel_size == 1
         assert cfg.expert_tensor_parallel_size == 1
-        assert cfg.moe_extended_tp is False
         assert cfg.sequence_parallel is False
         assert cfg.virtual_pipeline_model_parallel_size is None
         assert cfg.hierarchical_context_parallel_sizes is None
@@ -561,7 +559,6 @@ def test_load_megatron_model_applies_overrides(self, mock_load_model_config, moc
         cfg.context_parallel_size = 1
         cfg.expert_model_parallel_size = 1
         cfg.expert_tensor_parallel_size = 1
-        cfg.moe_extended_tp = False
         cfg.sequence_parallel = False
         cfg.virtual_pipeline_model_parallel_size = None
         cfg.hierarchical_context_parallel_sizes = None
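The tests stop exercising the deleted field rather than asserting anything about it. If a regression guard were wanted, a hypothetical follow-up test (not part of this PR; `cfg` stands in for whatever fixture the suite provides) could pin down that the attribute no longer exists at all:

def test_moe_extended_tp_is_gone(cfg):
    # Hypothetical guard: once removed, the field should not resurface
    # on loaded model configs.
    assert not hasattr(cfg, "moe_extended_tp")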