diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py
index e51c417934..a7de02bb87 100644
--- a/src/megatron/bridge/recipes/gemma/gemma2.py
+++ b/src/megatron/bridge/recipes/gemma/gemma2.py
@@ -340,7 +340,7 @@ def _gemma2_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Union[str, PEFT, None] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
diff --git a/src/megatron/bridge/recipes/gemma/gemma3.py b/src/megatron/bridge/recipes/gemma/gemma3.py
index 224a429fcc..d44d2ae582 100644
--- a/src/megatron/bridge/recipes/gemma/gemma3.py
+++ b/src/megatron/bridge/recipes/gemma/gemma3.py
@@ -376,7 +376,7 @@ def _gemma3_finetune_common(
     name: str = "default",
     # Finetuning-specific
     pretrained_checkpoint: str | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training hyperparameters
     train_iters: int = 100,
     global_batch_size: int | None = None,
diff --git a/src/megatron/bridge/recipes/glm/glm45.py b/src/megatron/bridge/recipes/glm/glm45.py
index ad811a93b5..ee96009c5e 100644
--- a/src/megatron/bridge/recipes/glm/glm45.py
+++ b/src/megatron/bridge/recipes/glm/glm45.py
@@ -369,6 +369,7 @@ def glm45_355b_finetune_config(**user_kwargs: Unpack[GLM45FinetuneKwargs]) -> Co
         "expert_model_parallel_size": 16 if is_full_sft else 4,
         "peft": peft_value,
         "finetune_lr": 5e-6 if is_full_sft else 1e-4,
+        "packed_sequence": False,  # Packed sequence is not supported for GLM 4.5
     }
     kwargs: GLM45FinetuneKwargs = {**recommended, **user_kwargs}
     return _glm45_finetune_common(**kwargs)
@@ -391,6 +392,7 @@ def glm45_air_106b_finetune_config(**user_kwargs: Unpack[GLM45FinetuneKwargs]) -
         "expert_model_parallel_size": 8 if is_full_sft else 4,
         "peft": peft_value,
         "finetune_lr": 5e-6 if is_full_sft else 1e-4,
+        "packed_sequence": False,  # Packed sequence is not supported for GLM 4.5
     }
     kwargs: GLM45FinetuneKwargs = {**recommended, **user_kwargs}
     return _glm45_finetune_common(**kwargs)
@@ -412,7 +414,7 @@ def _glm45_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
diff --git a/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py b/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
index 7afa301d4e..35085436b8 100644
--- a/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
+++ b/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
@@ -380,7 +380,7 @@ def _gpt_oss_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
diff --git a/src/megatron/bridge/recipes/llama/llama3.py b/src/megatron/bridge/recipes/llama/llama3.py
index 59d6a8631a..4d729b8d1b 100644
--- a/src/megatron/bridge/recipes/llama/llama3.py
+++ b/src/megatron/bridge/recipes/llama/llama3.py
@@ -922,7 +922,7 @@ def _llama3_finetune_common(
     name: str = "default",
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,
diff --git a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
index 4bfa6d9b64..622cdeb0a2 100644
--- a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
+++ b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
@@ -519,7 +519,7 @@ def _moonlight_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
index 5e8a98a7a9..7c1aa19e84 100644
--- a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
+++ b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
@@ -399,7 +399,7 @@ def _nemotron_3_nano_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
diff --git a/src/megatron/bridge/recipes/nemotronh/nemotronh.py b/src/megatron/bridge/recipes/nemotronh/nemotronh.py
index fff3db3965..ed8b11eed2 100644
--- a/src/megatron/bridge/recipes/nemotronh/nemotronh.py
+++ b/src/megatron/bridge/recipes/nemotronh/nemotronh.py
@@ -470,7 +470,7 @@ def _nemotronh_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
@@ -508,7 +508,7 @@ def _nemotronh_finetune_common(
         sequence_parallelism: Whether to use sequence parallelism.
         pretrained_checkpoint: Path to pretrained checkpoint to load from.
         peft: PEFT configuration (e.g., "lora", "dora") or PEFT object. None for full SFT. Default: "lora".
-        packed_sequence: Whether to use packed sequences. Default: False.
+        packed_sequence: Whether to use packed sequences. Default: True.
         train_iters: Total number of training iterations. Default: 1000.
         global_batch_size: Global batch size. Default: 128.
         micro_batch_size: Micro batch size. Default: 1.
diff --git a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
index 52fac6f2b4..263d1219bf 100644
--- a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
+++ b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
@@ -495,7 +495,7 @@ def _olmoe_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Optional[Union[str, PEFT]] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int = 128,
diff --git a/src/megatron/bridge/recipes/qwen/qwen2.py b/src/megatron/bridge/recipes/qwen/qwen2.py
index 7ca12204b8..af350095da 100644
--- a/src/megatron/bridge/recipes/qwen/qwen2.py
+++ b/src/megatron/bridge/recipes/qwen/qwen2.py
@@ -594,7 +594,7 @@ def _qwen2_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: Optional[str] = None,
     peft: Union[str, PEFT, None] = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
diff --git a/src/megatron/bridge/recipes/qwen/qwen3.py b/src/megatron/bridge/recipes/qwen/qwen3.py
index b12e4a2a42..4fec4318df 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3.py
@@ -511,7 +511,7 @@ def _qwen3_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = "lora",
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,  # Auto-select based on packed_sequence if None
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_moe.py b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
index a2ae6b6350..369238c2a8 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_moe.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
@@ -472,7 +472,7 @@ def _qwen3_moe_finetune_common(
     name: str = "default",
     # Finetuning-specific
     pretrained_checkpoint: Optional[str] = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training hyperparameters
     train_iters: int = 100,
     global_batch_size: Optional[int] = None,
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_next.py b/src/megatron/bridge/recipes/qwen/qwen3_next.py
index 7dbc5583e4..ca894f8ceb 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_next.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_next.py
@@ -378,6 +378,7 @@ def qwen3_next_80b_a3b_finetune_config(**user_kwargs: Unpack[Qwen3NextFinetuneKw
         "finetune_lr": 5e-6,
         "min_lr": 5e-6,
         "enable_recompute": True,
+        "packed_sequence": False,  # Sequence packing is not supported for Qwen3-Next
     }
     combined_kwargs: Qwen3NextFinetuneKwargs = {**recommended_kwargs, **user_kwargs}
     config = _qwen3_next_finetune_common(**combined_kwargs)
@@ -405,7 +406,7 @@ def _qwen3_next_finetune_common(
     # Finetuning-specific params
     pretrained_checkpoint: str | None = None,
     peft: str | PEFT | None = None,
-    packed_sequence: bool = False,
+    packed_sequence: bool = True,
     # Training params
     train_iters: int = 1000,
     global_batch_size: int | None = None,  # Auto-select based on packed_sequence if None
diff --git a/src/megatron/bridge/recipes/utils/finetune_utils.py b/src/megatron/bridge/recipes/utils/finetune_utils.py
index bde88e03c3..df465dc459 100644
--- a/src/megatron/bridge/recipes/utils/finetune_utils.py
+++ b/src/megatron/bridge/recipes/utils/finetune_utils.py
@@ -50,7 +50,7 @@ def default_peft_config(peft_scheme: str | PEFT | None, **kwargs) -> PEFT | None
     raise ValueError(f"Invalid peft type: {type(peft_scheme)}. Expected str, PEFT instance, or None")


-def default_squad_config(seq_length: int, packed_sequence: bool = False, pad_seq_to_mult: int = 1) -> HFDatasetConfig:
+def default_squad_config(seq_length: int, packed_sequence: bool = True, pad_seq_to_mult: int = 1) -> HFDatasetConfig:
     """Create default SQuAD dataset configuration for finetuning recipes.

     Args:
diff --git a/tests/unit_tests/recipes/test_qwen_recipes.py b/tests/unit_tests/recipes/test_qwen_recipes.py
index 54232bce8a..cc09d1a0c4 100644
--- a/tests/unit_tests/recipes/test_qwen_recipes.py
+++ b/tests/unit_tests/recipes/test_qwen_recipes.py
@@ -98,6 +98,7 @@ class _FakeModelCfg:

     def __init__(self):
         self.cross_entropy_fusion_impl = "native"
+        self.context_parallel_size = 1

     def finalize(self):
         # qwen3 recipe may call finalize(); make it a no-op
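Usage note (illustrative sketch, not part of the patch): with the default flipped to packed_sequence=True, recipes that cannot use sequence packing must opt out explicitly, as the GLM 4.5 and Qwen3-Next entry points above now do. The snippet below only assumes the default_squad_config signature shown in this diff; the seq_length value is an arbitrary example.

from megatron.bridge.recipes.utils.finetune_utils import default_squad_config

# Packing is now on by default for the SQuAD finetuning dataset config.
packed_cfg = default_squad_config(seq_length=4096)

# Callers that need the previous behavior pass packed_sequence=False explicitly,
# mirroring the "packed_sequence": False overrides added for GLM 4.5 and Qwen3-Next.
unpacked_cfg = default_squad_config(seq_length=4096, packed_sequence=False)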