2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/gemma/gemma2.py
@@ -340,7 +340,7 @@ def _gemma2_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Union[str, PEFT, None] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
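
For context on what this flag controls: sequence packing concatenates several variable-length finetuning samples into one fixed-length buffer, recording cumulative boundaries so attention does not cross sample borders. A minimal, self-contained sketch of the idea (illustrative only; pack_sequences is a hypothetical name, and the real packing lives in the recipes' dataset helpers):

def pack_sequences(samples: list[list[int]], max_len: int) -> tuple[list[int], list[int]]:
    """Concatenate samples into one buffer and record cumulative boundaries."""
    buffer: list[int] = []
    cu_seqlens: list[int] = [0]
    for tokens in samples:
        if len(buffer) + len(tokens) > max_len:
            break  # a real packer would start a new pack here
        buffer.extend(tokens)
        cu_seqlens.append(len(buffer))
    return buffer, cu_seqlens

packed, bounds = pack_sequences([[1, 2, 3], [4, 5], [6, 7, 8, 9]], max_len=16)
assert bounds == [0, 3, 5, 9]  # three samples share one buffer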
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/gemma/gemma3.py
@@ -376,7 +376,7 @@ def _gemma3_finetune_common(
     name: str = "default",
     # Finetuning-specific
     pretrained_checkpoint: str | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training hyperparameters
     train_iters: int = 100,
     global_batch_size: int | None = None,
4 changes: 3 additions & 1 deletion src/megatron/bridge/recipes/glm/glm45.py
@@ -369,6 +369,7 @@ def glm45_355b_finetune_config(**user_kwargs: Unpack[GLM45FinetuneKwargs]) -> Co
         "expert_model_parallel_size": 16 if is_full_sft else 4,
         "peft": peft_value,
         "finetune_lr": 5e-6 if is_full_sft else 1e-4,
+        "packed_sequence": False,  # Packed sequence is not supported for GLM 4.5
     }
     kwargs: GLM45FinetuneKwargs = {**recommended, **user_kwargs}
     return _glm45_finetune_common(**kwargs)
@@ -391,6 +392,7 @@ def glm45_air_106b_finetune_config(**user_kwargs: Unpack[GLM45FinetuneKwargs]) -
         "expert_model_parallel_size": 8 if is_full_sft else 4,
         "peft": peft_value,
         "finetune_lr": 5e-6 if is_full_sft else 1e-4,
+        "packed_sequence": False,  # Packed sequence is not supported for GLM 4.5
     }
     kwargs: GLM45FinetuneKwargs = {**recommended, **user_kwargs}
     return _glm45_finetune_common(**kwargs)
@@ -412,7 +414,7 @@ def _glm45_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
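
Note the pattern above (and repeated for Qwen3-Next further down): models that do not support packing pin "packed_sequence": False in their recommended kwargs, and because user kwargs are merged last, an explicit caller value still takes precedence. A minimal sketch of the merge semantics with illustrative values:

recommended = {"packed_sequence": False, "finetune_lr": 1e-4}
user_kwargs = {"finetune_lr": 5e-5}  # caller overrides only the learning rate

merged = {**recommended, **user_kwargs}
assert merged == {"packed_sequence": False, "finetune_lr": 5e-5}  # later entries win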
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
@@ -380,7 +380,7 @@ def _gpt_oss_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/llama/llama3.py
@@ -922,7 +922,7 @@ def _llama3_finetune_common(
     name: str = "default",
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/moonlight/moonlight_16b.py
@@ -519,7 +519,7 @@ def _moonlight_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
@@ -399,7 +399,7 @@ def _nemotron_3_nano_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
4 changes: 2 additions & 2 deletions src/megatron/bridge/recipes/nemotronh/nemotronh.py
@@ -470,7 +470,7 @@ def _nemotronh_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
@@ -508,7 +508,7 @@ def _nemotronh_finetune_common(
         sequence_parallelism: Whether to use sequence parallelism.
         pretrained_checkpoint: Path to pretrained checkpoint to load from.
         peft: PEFT configuration (e.g., "lora", "dora") or PEFT object. None for full SFT. Default: "lora".
-        packed_sequence: Whether to use packed sequences. Default: False.
+        packed_sequence: Whether to use packed sequences. Default: True.
         train_iters: Total number of training iterations. Default: 1000.
         global_batch_size: Global batch size. Default: 128.
         micro_batch_size: Micro batch size. Default: 1.
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/olmoe/olmoe_7b.py
@@ -495,7 +495,7 @@ def _olmoe_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/qwen/qwen2.py
@@ -594,7 +594,7 @@ def _qwen2_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Union[str, PEFT, None] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/qwen/qwen3.py
@@ -511,7 +511,7 @@ def _qwen3_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,  # Auto-select based on packed_sequence if None
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/qwen/qwen3_moe.py
@@ -472,7 +472,7 @@ def _qwen3_moe_finetune_common(
     name: str = "default",
     # Finetuning-specific
     pretrained_checkpoint: Optional[str] = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training hyperparameters
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
3 changes: 2 additions & 1 deletion src/megatron/bridge/recipes/qwen/qwen3_next.py
@@ -378,6 +378,7 @@ def qwen3_next_80b_a3b_finetune_config(**user_kwargs: Unpack[Qwen3NextFinetuneKw
         "finetune_lr": 5e-6,
         "min_lr": 5e-6,
         "enable_recompute": True,
+        "packed_sequence": False,  # Sequence packing is not supported for Qwen3-Next
     }
     combined_kwargs: Qwen3NextFinetuneKwargs = {**recommended_kwargs, **user_kwargs}
     config = _qwen3_next_finetune_common(**combined_kwargs)
@@ -405,7 +406,7 @@ def _qwen3_next_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,  # Auto-select based on packed_sequence if None
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/utils/finetune_utils.py
@@ -50,7 +50,7 @@ def default_peft_config(peft_scheme: str | PEFT | None, **kwargs) -> PEFT | None
     raise ValueError(f"Invalid peft type: {type(peft_scheme)}. Expected str, PEFT instance, or None")


-def default_squad_config(seq_length: int, packed_sequence: bool = False, pad_seq_to_mult: int = 1) -> HFDatasetConfig:
+def default_squad_config(seq_length: int, packed_sequence: bool = True, pad_seq_to_mult: int = 1) -> HFDatasetConfig:
     """Create default SQuAD dataset configuration for finetuning recipes.

     Args:
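
Because default_squad_config is shared by the recipes above, any caller that relied on the implicit False now gets packing by default; passing the flag explicitly keeps behavior stable across this change. A usage sketch (the seq_length value is illustrative, and the import path is assumed from the file path shown above):

from megatron.bridge.recipes.utils.finetune_utils import default_squad_config

# Pinning the flag explicitly is robust to default flips like this one.
packed_cfg = default_squad_config(seq_length=4096, packed_sequence=True)
unpacked_cfg = default_squad_config(seq_length=4096, packed_sequence=False)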
1 change: 1 addition & 0 deletions tests/unit_tests/recipes/test_qwen_recipes.py
@@ -98,6 +98,7 @@ class _FakeModelCfg:

     def __init__(self):
         self.cross_entropy_fusion_impl = "native"
+        self.context_parallel_size = 1

     def finalize(self):
         # qwen3 recipe may call finalize(); make it a no-op