diff --git a/src/megatron/bridge/recipes/common.py b/src/megatron/bridge/recipes/common.py index 8ea8fe0179..c422baea5b 100644 --- a/src/megatron/bridge/recipes/common.py +++ b/src/megatron/bridge/recipes/common.py @@ -16,9 +16,11 @@ from megatron.core.distributed import DistributedDataParallelConfig +from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider from megatron.bridge.peft.lora import LoRA from megatron.bridge.recipes.utils.finetune_utils import default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -335,3 +337,210 @@ def _peft_common() -> ConfigContainer: ) return cfg + + +def _sft_common_vlm() -> ConfigContainer: + """Create a base SFT ConfigContainer with common defaults for Vision-Language Models. + + This function inherits from `_sft_common()` and overrides VLM-specific settings. + The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. + + Key differences from LLM SFT (`_sft_common`): + - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) + - Uses NullTokenizer (VLMs use processor instead of tokenizer) + - DDP config optimized for VLM training (no grad/param overlap) + - Supports freeze options for language_model, vision_model, vision_projection + - Different training defaults (train_iters=300000, GBS=32, MBS=2) + - Different RNG seed (1234) + + Returns: + ConfigContainer: Base configuration template for VLM full SFT. 
+ """ + # Start from the LLM SFT common config + cfg = _sft_common() + + # Default output directories + base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, "default") + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Default sequence length for VLM + seq_length = 4096 + + # VLM-specific training config - longer training with different batch sizes + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # VLM-specific validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # VLM-specific optimizer settings - higher LR for VLM training + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=None, # Defaults to train_iters during validation + max_lr=3e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # VLM-specific DDP config - no overlap for VLMs + cfg.ddp = DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=False, + overlap_param_gather=False, + average_in_collective=True, + data_parallel_sharding_strategy="optim_grads_params", + use_distributed_optimizer=True, + ) + + # VLM-specific dataset - uses HuggingFace dataset provider + # hf_processor_path must be set by model-specific config + cfg.dataset = HFDatasetConversationProvider( + seq_length=seq_length, + hf_processor_path=None, # Must be set by model-specific config + maker_name="make_cord_v2_dataset", + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + pack_sequences_in_batch=True, + ) + + # VLM uses NullTokenizer - actual tokenization is handled by the processor + cfg.tokenizer = 
TokenizerConfig( + tokenizer_type="NullTokenizer", + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, + ) + + # VLM-specific logger config + cfg.logger = LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ) + + # VLM-specific checkpoint config + cfg.checkpoint.save_interval = 500 + cfg.checkpoint.save = checkpoint_dir + cfg.checkpoint.load = checkpoint_dir + cfg.checkpoint.ckpt_format = "torch_dist" + cfg.checkpoint.fully_parallel_save = True + + # VLM uses different RNG seed + cfg.rng = RNGConfig(seed=1234) + + return cfg + + +def _peft_common_vlm() -> ConfigContainer: + """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models. + + This function inherits from `_peft_common()` and overrides VLM-specific settings. + The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. + + Key differences from LLM PEFT (`_peft_common`): + - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) + - Uses NullTokenizer (VLMs use processor instead of tokenizer) + - DDP config optimized for VLM training (no grad/param overlap) + - Supports freeze options for language_model, vision_model, vision_projection + - Different training defaults (train_iters=300000, GBS=32, MBS=2) + - Different RNG seed (1234) + - Higher LR (1e-4) for adapter training + + Returns: + ConfigContainer: Base configuration template for VLM PEFT with LoRA. 
+ """ + # Start from the LLM PEFT common config + cfg = _peft_common() + + # Default output directories + base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, "default") + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Default sequence length for VLM + seq_length = 4096 + + # VLM-specific training config - longer training with different batch sizes + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # VLM-specific validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # VLM-specific optimizer settings - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=None, # Defaults to train_iters during validation + max_lr=1e-4, # Higher LR for adapter training + min_lr=1e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # VLM-specific DDP config - no overlap for VLMs + cfg.ddp = DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=False, + overlap_param_gather=False, + average_in_collective=True, + data_parallel_sharding_strategy="optim_grads_params", + use_distributed_optimizer=True, + ) + + # VLM-specific dataset - uses HuggingFace dataset provider + # hf_processor_path must be set by model-specific config + cfg.dataset = HFDatasetConversationProvider( + seq_length=seq_length, + hf_processor_path=None, # Must be set by model-specific config + maker_name="make_cord_v2_dataset", + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + pack_sequences_in_batch=True, + ) + + # VLM uses NullTokenizer - actual tokenization is handled by the 
processor + cfg.tokenizer = TokenizerConfig( + tokenizer_type="NullTokenizer", + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, + ) + + # VLM-specific logger config + cfg.logger = LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ) + + # VLM-specific checkpoint config + cfg.checkpoint.save_interval = 500 + cfg.checkpoint.save = checkpoint_dir + cfg.checkpoint.load = checkpoint_dir + cfg.checkpoint.ckpt_format = "torch_dist" + cfg.checkpoint.fully_parallel_save = True + + # VLM uses different RNG seed + cfg.rng = RNGConfig(seed=1234) + + # Keep LoRA config from _peft_common() - it's already set with standard defaults + + return cfg diff --git a/src/megatron/bridge/recipes/gemma3_vl/__init__.py b/src/megatron/bridge/recipes/gemma3_vl/__init__.py index de402836e9..f560a6c109 100644 --- a/src/megatron/bridge/recipes/gemma3_vl/__init__.py +++ b/src/megatron/bridge/recipes/gemma3_vl/__init__.py @@ -13,14 +13,20 @@ # limitations under the License. 
from megatron.bridge.recipes.gemma3_vl.gemma3_vl import ( - gemma3_vl_4b_finetune_config, - gemma3_vl_12b_finetune_config, - gemma3_vl_27b_finetune_config, + gemma3_vl_4b_peft_config, + gemma3_vl_4b_sft_config, + gemma3_vl_12b_peft_config, + gemma3_vl_12b_sft_config, + gemma3_vl_27b_peft_config, + gemma3_vl_27b_sft_config, ) __all__ = [ - "gemma3_vl_4b_finetune_config", - "gemma3_vl_12b_finetune_config", - "gemma3_vl_27b_finetune_config", + "gemma3_vl_4b_sft_config", + "gemma3_vl_12b_sft_config", + "gemma3_vl_27b_sft_config", + "gemma3_vl_4b_peft_config", + "gemma3_vl_12b_peft_config", + "gemma3_vl_27b_peft_config", ] diff --git a/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py b/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py index 7e29af13ab..66cc3b7d66 100644 --- a/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py +++ b/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py @@ -12,339 +12,679 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import List, Optional, Union +"""Gemma3-VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Gemma3-VL models (4B, 12B, 27B). 
+""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider -from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Gemma3VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Gemma3-VL recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Gemma3-VL 4B SFT Configuration +# ============================================================================= +def gemma3_vl_4b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 4B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-5 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def gemma3_vl_4b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 4B Instruct. 
+ hf_path = "google/gemma-3-4b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + 
cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 12B SFT Configuration +# ============================================================================= +def gemma3_vl_12b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 12B Instruct. 
Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=1, PP=1, LR=5e-6 + - TP=4, PP=1 + - LR=5e-5 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "google/gemma-3-12b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - See `_gemma3_vl_common` for the full list of parameters. 
+ # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint 
+ # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 27B SFT Configuration +# ============================================================================= +def gemma3_vl_27b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 27B Instruct. + + Default configuration: 2 nodes, 16 GPUs total + - TP=8, PP=2 + - LR=5e-5 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-4b-it", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) + # Model configuration + hf_path = "google/gemma-3-27b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + # Parallel settings + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + 
cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # 
cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg -def gemma3_vl_12b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 12B Instruct. + +# ============================================================================= +# Gemma3-VL 4B PEFT Configuration +# ============================================================================= +def gemma3_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 4B Instruct. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=4, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=2e-4 (PEFT) + - Sequence length: 4096 - See `_gemma3_vl_common` for the full list of parameters. + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _peft_common_vlm() - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-12b-it", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration + hf_path = "google/gemma-3-4b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 -def gemma3_vl_27b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 27B Instruct. 
+ # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False - Default configuration: 2 nodes, 16 GPUs total - - LoRA/DoRA: TP=4, PP=1, LR=1e-4 - - Full SFT: TP=8, PP=2, LR=5e-6 + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = 
torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 - See `_gemma3_vl_common` for the full list of parameters. - """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-27b-it", - "tensor_model_parallel_size": 8 if is_full_sft else 4, - "pipeline_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_dtype": torch.bfloat16 if is_full_sft else None, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) - - -def _gemma3_vl_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "gemma3_vl_finetune", - pretrained_checkpoint: Optional[str] = None, # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, - # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: 
Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for Gemma3-VL models using a given HuggingFace path. + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False - The dataset pipeline is based on the Gemma3-VL architecture. To train multimodal tokens, - ensure your preprocessed data includes appropriate image placeholders. + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 12B PEFT Configuration +# ============================================================================= +def gemma3_vl_12b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 12B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 (lower than SFT for PEFT) + - LR=2e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - model_cfg.cp_comm_type = "a2a" - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - effective_lr = finetune_lr if finetune_lr is not None else lr - if min_lr > effective_lr: - min_lr = effective_lr * 0.1 - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "google/gemma-3-12b-it" + cfg.model = 
AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 27B PEFT Configuration +# ============================================================================= +def gemma3_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 27B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 (lower than SFT for PEFT) + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. 
- _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - random_seed=0, - image_size=(256, 256), - pad_to_max_length=True, - create_attention_mask=True, - num_images=1, - dataloader_type="single", - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError( - f"Unsupported dataset_type '{_dataset_choice}'. Currently only 'mock' is supported for Gemma3-VL." 
- ) - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "google/gemma-3-27b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP and PP for PEFT + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + 
cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # 
cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/glm_vl/__init__.py b/src/megatron/bridge/recipes/glm_vl/__init__.py index 0e61b8f27c..96edc1c446 100644 --- a/src/megatron/bridge/recipes/glm_vl/__init__.py +++ b/src/megatron/bridge/recipes/glm_vl/__init__.py @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .glm_45v import glm_45v_finetune_config +from .glm_45v import ( + glm_45v_peft_config, + glm_45v_sft_config, + set_glm_45v_pipeline_model_parallel_layout, +) __all__ = [ - "glm_45v_finetune_config", + "glm_45v_sft_config", + "glm_45v_peft_config", + "set_glm_45v_pipeline_model_parallel_layout", ] diff --git a/src/megatron/bridge/recipes/glm_vl/glm_45v.py b/src/megatron/bridge/recipes/glm_vl/glm_45v.py index 5536f6e83e..fb53b5b3dc 100644 --- a/src/megatron/bridge/recipes/glm_vl/glm_45v.py +++ b/src/megatron/bridge/recipes/glm_vl/glm_45v.py @@ -12,36 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +"""GLM-4.5V finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for GLM-4.5V (106B MoE). 
+""" + from typing import List, Optional, Union import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig +from megatron.bridge.training.config import ConfigContainer def set_glm_45v_pipeline_model_parallel_layout( @@ -95,287 +81,296 @@ def set_glm_45v_pipeline_model_parallel_layout( model_cfg.pipeline_model_parallel_layout = layout_map[(pp_size, vp_size)] -class GLM45VCommonKwargs(TypedDict, total=False): - """Typed options accepted by GLM-4.5V recipe helper functions.""" +# ============================================================================= +# GLM-4.5V SFT Configuration +# ============================================================================= +def glm_45v_sft_config() -> ConfigContainer: + """Return a full SFT config for GLM-4.5V (106B MoE). 
- # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - expert_model_parallel_size: int - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # Pipeline layout - layout: Optional[Union[str, List[List[str]]]] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def glm_45v_finetune_config(**user_kwargs: Unpack[GLM45VCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for GLM-4.5V (based on GLM-4.5 Air 106B). - - Default configuration: - - LoRA/DoRA: TP=1, PP=8, EP=4 (64 GPUs, 8 nodes), LR=1e-4 - - Full SFT: TP=1, PP=8, EP=16 (512 GPUs, 64 nodes), LR=5e-6 - - GLM-4.5V is a Vision-Language model with: - - 106B total parameters (based on GLM-4.5 Air) - - Sparse MoE with shared experts - - Multi-modality support for images and videos - - See `_glm_45v_common` for the full list of parameters. 
+ Default configuration: 64 nodes, 512 GPUs + - TP=1, PP=8, EP=16 + - LR=5e-6 (full SFT) + - Sequence length: 8192 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: GLM45VCommonKwargs = { - "hf_path": "zai-org/GLM-4.5V", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 8, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 16 if is_full_sft else 4, - "global_batch_size": 64 if is_full_sft else 32, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: GLM45VCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _glm_45v_common(**combined_kwargs) - - -def _glm_45v_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "glm_45v_finetune", - pretrained_checkpoint: Optional[str] = None, - # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - expert_model_parallel_size: int = 4, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, 
str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # Pipeline layout - layout: Optional[Union[str, List[List[str]]]] = None, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for GLM-4.5V models using a given HuggingFace path. + cfg = _sft_common_vlm() - The dataset pipeline is conversation-based. To train multimodal tokens, ensure your - preprocessed data includes placeholders (e.g., ) as needed. + # Model configuration + hf_path = "zai-org/GLM-4.5V" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 8192 - GLM-4.5V is a Vision-Language model based on GLM-4.5 Air (106B parameters) with: - - Sparse MoE architecture with shared experts - - Multi-modal support for images and videos - - MRoPE (Multi-Resolution Rotary Position Embedding) - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.expert_model_parallel_size 
= expert_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 8 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 16 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False # Set pipeline model parallel layout for asymmetric stages - set_glm_45v_pipeline_model_parallel_layout(model_cfg, layout, is_peft=peft is not None) + set_glm_45v_pipeline_model_parallel_layout(cfg.model, layout=None, is_peft=False) # Pipeline split for asymmetric stages are specified with the layout above - model_cfg.account_for_embedding_in_pipeline_split = False - model_cfg.account_for_loss_in_pipeline_split = False - model_cfg.num_layers_in_first_pipeline_stage = None - model_cfg.num_layers_in_last_pipeline_stage = None - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - # Ensure min_lr does not exceed max_lr (use 10% of effective_lr as default min) - effective_lr = finetune_lr if finetune_lr is not None else lr - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min(min_lr, effective_lr * 0.1), + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # VLM-specific settings + cfg.model.freeze_language_model = False + 
cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = True + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 64 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=5e-6, + min_lr=5e-6 * 0.1, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 8192 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - GLM-4.5V specific settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (uncomment to use FP8) + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. 
- _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) +# ============================================================================= +# GLM-4.5V PEFT Configuration +# ============================================================================= +def glm_45v_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for GLM-4.5V (106B MoE). + + Default configuration: 8 nodes, 64 GPUs + - TP=1, PP=8, EP=4 + - LR=1e-4 (PEFT) + - Sequence length: 8192 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].") - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "zai-org/GLM-4.5V" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 8192 + + # Parallel settings - lower EP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 
8 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 4 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # Set pipeline model parallel layout for asymmetric stages + set_glm_45v_pipeline_model_parallel_layout(cfg.model, layout=None, is_peft=True) + + # Pipeline split for asymmetric stages are specified with the layout above + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = True + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 50 + 
cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=1e-4, + min_lr=1e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 8192 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - GLM-4.5V specific settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (uncomment to use FP8) + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg 
diff --git a/src/megatron/bridge/recipes/ministral3/__init__.py b/src/megatron/bridge/recipes/ministral3/__init__.py index 195375c9bb..0e062c0ec5 100644 --- a/src/megatron/bridge/recipes/ministral3/__init__.py +++ b/src/megatron/bridge/recipes/ministral3/__init__.py @@ -14,15 +14,22 @@ # Ministral3 models from .ministral3 import ( - ministral3_3b_finetune_config, - ministral3_8b_finetune_config, - ministral3_14b_finetune_config, + ministral3_3b_peft_config, + ministral3_3b_sft_config, + ministral3_8b_peft_config, + ministral3_8b_sft_config, + ministral3_14b_peft_config, + ministral3_14b_sft_config, ) __all__ = [ - # Ministral3 models - "ministral3_3b_finetune_config", - "ministral3_8b_finetune_config", - "ministral3_14b_finetune_config", + # Ministral3 SFT configs + "ministral3_3b_sft_config", + "ministral3_8b_sft_config", + "ministral3_14b_sft_config", + # Ministral3 PEFT configs + "ministral3_3b_peft_config", + "ministral3_8b_peft_config", + "ministral3_14b_peft_config", ] diff --git a/src/megatron/bridge/recipes/ministral3/ministral3.py b/src/megatron/bridge/recipes/ministral3/ministral3.py index c52c17ecf7..8d6752b750 100644 --- a/src/megatron/bridge/recipes/ministral3/ministral3.py +++ b/src/megatron/bridge/recipes/ministral3/ministral3.py @@ -12,336 +12,709 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import List, Optional, Union +"""Ministral3 finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Ministral3 models (3B, 8B, 14B). 
+""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Ministral3FinetuneKwargs(TypedDict, total=False): - """Typed options accepted by Ministral3 finetuning recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - seq_length: Optional[int] +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Ministral3 3B SFT Configuration +# ============================================================================= +def ministral3_3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 3B. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: Optional[int] - micro_batch_size: int - eval_interval: int - save_interval: int - # Optimizer - finetune_lr: Optional[float] - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def ministral3_3b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Ministral3 3B. 
+ hf_path = "mistralai/Ministral-3-3B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = 
torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 8B SFT Configuration +# ============================================================================= +def ministral3_8b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 8B. 
Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=1, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=1, PP=1, LR=5e-6 + - TP=2, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "mistralai/Ministral-3-8B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - See `_ministral3_finetune_common` for the full list of parameters. 
+ # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # 
cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 14B SFT Configuration +# ============================================================================= +def ministral3_14b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 14B. + + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-3B-Instruct-2512", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) + # Model configuration + hf_path = "mistralai/Ministral-3-14B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + 
cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" -def ministral3_8b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning 
config for Ministral3 8B. + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 3B PEFT Configuration +# ============================================================================= +def ministral3_3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 3B. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=1, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=2, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_ministral3_finetune_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
""" - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "mistralai/Ministral-3-3B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, 
scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-8B-Instruct-2512", - "tensor_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg -def 
ministral3_14b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Ministral3 14B. + +# ============================================================================= +# Ministral3 8B PEFT Configuration +# ============================================================================= +def ministral3_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 8B. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=2, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=4, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_ministral3_finetune_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-14B-Instruct-2512", - "tensor_model_parallel_size": 4 if is_full_sft else 2, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) - - -def _ministral3_finetune_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "ministral3_finetune", - pretrained_checkpoint: Optional[str] = None, - # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", 
"dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1000, - global_batch_size: int = 32, - micro_batch_size: int = 1, - seq_length: int = 4096, - eval_interval: int = 30, - save_interval: int = 50, - # Optimizer - finetune_lr: Optional[float] = None, - min_lr: float = 0.0, - lr_warmup_iters: int = 50, - lr_decay_iters: Optional[int] = None, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for Ministral3 family models using a given HuggingFace path. + hf_path = "mistralai/Ministral-3-8B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 - The dataset pipeline is conversation-based. To train multimodal tokens, ensure your - preprocessed data includes placeholders (e.g., ) as needed. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use default - effective_lr = finetune_lr if finetune_lr is not None else 1e-4 - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + 
cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # 
cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy - _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) + return cfg + + +# ============================================================================= +# Ministral3 14B PEFT Configuration +# 
============================================================================= +def ministral3_14b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 14B. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].") - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - 
comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "mistralai/Ministral-3-14B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + 
cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/nemotron_vl/__init__.py b/src/megatron/bridge/recipes/nemotron_vl/__init__.py index 0de786c5ef..4fe909096e 100644 --- a/src/megatron/bridge/recipes/nemotron_vl/__init__.py +++ b/src/megatron/bridge/recipes/nemotron_vl/__init__.py @@ -13,12 +13,12 @@ # limitations under the License. 
from .nemotron_nano_v2_vl import ( - nemotron_nano_v2_vl_12b_finetune_config, - nemotron_nano_v2_vl_12b_pretrain_config, + nemotron_nano_v2_vl_12b_peft_config, + nemotron_nano_v2_vl_12b_sft_config, ) __all__ = [ - "nemotron_nano_v2_vl_12b_pretrain_config", - "nemotron_nano_v2_vl_12b_finetune_config", + "nemotron_nano_v2_vl_12b_sft_config", + "nemotron_nano_v2_vl_12b_peft_config", ] diff --git a/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py b/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py index c1824a336d..1ae2f13522 100644 --- a/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py +++ b/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py @@ -12,240 +12,259 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Optional, Union +"""Nemotron Nano V2 VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Nemotron Nano V2 VL 12B. 
+""" import torch from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, -) -from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider +from megatron.bridge.peft.base import PEFT from megatron.bridge.peft.lora import VLMLoRA +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def nemotron_nano_v2_vl_12b_pretrain_config( - dir: Optional[str] = None, - name: str = "nemotron_nano_v2_vl_pretrain", - hf_model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", - # Dataset configuration - dataset_type: Optional[str] = None, - mock: bool = False, - dataset_maker_name: str = "make_cord_v2_dataset", - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Checkpointing - 
save_interval: Optional[int] = 200, -) -> ConfigContainer: - """ - Create a pre-training configuration for Nemotron Nano V2 VL. +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Nemotron Nano V2 VL 12B SFT Configuration +# ============================================================================= +def nemotron_nano_v2_vl_12b_sft_config() -> ConfigContainer: + """Return a full SFT config for Nemotron Nano V2 VL 12B. - Note: Current dataset pipeline is text-centric. To train multimodal tokens, - your preprocessed data should include placeholder tokens (e.g., ) as needed. + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 + - LR=1e-5 (finetune default) + - Sequence length: 4096 """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_model_path, trust_remote_code=True) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_parallelism - model_cfg.pipeline_model_parallel_size = pipeline_parallelism - model_cfg.pipeline_dtype = pipeline_parallelism_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism - model_cfg.context_parallel_size = context_parallelism - model_cfg.sequence_parallel = sequence_parallelism - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + cfg.model = 
AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=True).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 2000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 0 + + # Optimizer - finetune defaults + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=5, + lr_decay_iters=None, + max_lr=2e-5, + min_lr=2e-6, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + 
cfg.optimizer.exp_avg_sq_dtype = torch.float32 - # Dataset provider selection - _dataset_choice = (dataset_type or ("mock" if mock else "hf")).lower() + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - Nemotron uses average_in_collective=False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 200 - if _dataset_choice == "mock": - dataset_cfg = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=hf_model_path, - dataloader_type="single", + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Nemotron Nano V2 VL 12B PEFT Configuration +# ============================================================================= +def nemotron_nano_v2_vl_12b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Nemotron Nano V2 VL 12B. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=5e-5 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ Note: Default uses VLMLoRA targeting all model components. + """ + cfg = _peft_common_vlm() + + # PEFT scheme - Nemotron uses VLMLoRA by default + if isinstance(peft_scheme, str) and peft_scheme.lower() == "lora": + cfg.peft = VLMLoRA( + target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], + dim=16, + alpha=32, ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=hf_model_path, - maker_name=dataset_maker_name, - # Dataloader config parameters - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, + elif isinstance(peft_scheme, str) and peft_scheme.lower() == "dora": + cfg.peft = VLMLoRA( + target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], + dim=16, + alpha=32, + dora=True, ) else: - raise ValueError(f"Unknown dataset_type '{_dataset_choice}'. Expected one of: 'mock', 'hf', 'preloaded'.") - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=500, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=False, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - # use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( 
- save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + cfg.peft = peft_scheme - return cfg + # Model configuration + hf_path = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + cfg.model = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=True).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False -def nemotron_nano_v2_vl_12b_finetune_config( - *, - pretrained_checkpoint: str = "", - lora_on_language_model: bool = False, - lora_on_vision_model: bool = False, - save_checkpoint_dir: Optional[str] = None, - **pretrain_kwargs, -) -> ConfigContainer: - """Create a finetuning configuration for Nemotron Nano V2 VL. - - This helper wraps :func:`nemotron_nano_v2_vl_12b_pretrain_config`, forwarding all keyword arguments to it - while additionally wiring up the :class:`CheckpointConfig` for finetuning from a - given *``pretrained_checkpoint``*. - - Parameters: - pretrained_checkpoint: str - Path to a Megatron-Bridge checkpoint (or a directory produced by - ``convert_ckpt_hf_to_megatron``) that will be loaded before training. - save_checkpoint_dir: str | None, default ``run_output_dir / "checkpoints"`` - Directory where new checkpoints will be saved / resumed from. If not - provided, we reuse the default path chosen by *nemotron_nano_v2_vl_12b_pretrain_config*. - lora_on_language_model: bool = True - Whether to apply PEFT to the language model. - lora_on_vision_model: bool = True - Whether to apply PEFT to the vision model. 
- **pretrain_kwargs: Any - Additional keyword arguments are forwarded verbatim to - :func:`nemotron_nano_v2_vl_12b_pretrain_config` to customise the base recipe (e.g. batch size, - learning rate, parallelism). - """ + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - cfg = nemotron_nano_v2_vl_12b_pretrain_config(**pretrain_kwargs) - - # Override Train hyper-parameters suitable for finetuning if the caller did - # not explicitly pass them via **pretrain_kwargs. - if pretrain_kwargs.get("train_iters") is None: - cfg.train.train_iters = 10_000 - if pretrain_kwargs.get("lr") is None and hasattr(cfg.optimizer, "lr"): - cfg.optimizer.lr = 1e-5 # type: ignore[attr-defined] - if pretrain_kwargs.get("min_lr") is None and hasattr(cfg.optimizer, "min_lr"): - cfg.optimizer.min_lr = 1e-6 # type: ignore[attr-defined] - - # Update CheckpointConfig for finetuning. 
- ckpt_dir = save_checkpoint_dir or cfg.checkpoint.save or cfg.checkpoint.load # type: ignore[attr-defined] - cfg.checkpoint = CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save=ckpt_dir, - load=ckpt_dir, - ckpt_format=cfg.checkpoint.ckpt_format, # preserve existing choice - fully_parallel_save=cfg.checkpoint.fully_parallel_save, - save_interval=cfg.checkpoint.save_interval, + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 2000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 0 + + # Optimizer - PEFT LR settings + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=5, + lr_decay_iters=None, + max_lr=2e-5, + min_lr=2e-6, ) - if lora_on_language_model: - if lora_on_vision_model: - cfg.peft = VLMLoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, - alpha=32, - ) - else: - cfg.peft = VLMLoRA( - target_modules=[ - "*language_model*.linear_qkv", - "*language_model*.linear_proj", - "*language_model*.linear_fc1", - "*language_model*.linear_fc2", - ], - dim=16, - alpha=32, - freeze_vision_model=False, - freeze_vision_projection=False, - ) - - cfg.optimizer.lr = 5e-5 - cfg.optimizer.min_lr = 5e-6 - cfg.model.tensor_model_parallel_size = 2 + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer 
= False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - Nemotron uses average_in_collective=False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 200 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py index afb6ac1048..4d89c691c3 100644 --- a/src/megatron/bridge/recipes/qwen_vl/__init__.py +++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py @@ -12,24 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Qwen3 models +# Qwen2.5-VL models +# Qwen3-VL models from .qwen3_vl import ( - qwen3_vl_8b_finetune_config, - qwen3_vl_8b_pretrain_config, - qwen3_vl_30b_a3b_finetune_config, - qwen3_vl_30b_a3b_pretrain_config, - qwen3_vl_235b_a22b_finetune_config, - qwen3_vl_235b_a22b_pretrain_config, + qwen3_vl_8b_peft_config, + qwen3_vl_8b_sft_config, + qwen3_vl_30b_a3b_peft_config, + qwen3_vl_30b_a3b_sft_config, + qwen3_vl_235b_a22b_peft_config, + qwen3_vl_235b_a22b_sft_config, +) +from .qwen25_vl import ( + qwen25_vl_3b_peft_config, + qwen25_vl_3b_sft_config, + qwen25_vl_7b_peft_config, + qwen25_vl_7b_sft_config, + qwen25_vl_32b_peft_config, + qwen25_vl_32b_sft_config, + qwen25_vl_72b_peft_config, + qwen25_vl_72b_sft_config, ) __all__ = [ - # Qwen3-VL pretrain configs - "qwen3_vl_8b_pretrain_config", - "qwen3_vl_30b_a3b_pretrain_config", - "qwen3_vl_235b_a22b_pretrain_config", - # Qwen3-VL finetune configs (with PEFT support) - "qwen3_vl_8b_finetune_config", - "qwen3_vl_30b_a3b_finetune_config", - "qwen3_vl_235b_a22b_finetune_config", + # Qwen2.5-VL SFT configs + "qwen25_vl_3b_sft_config", + "qwen25_vl_7b_sft_config", + "qwen25_vl_32b_sft_config", + "qwen25_vl_72b_sft_config", + # Qwen2.5-VL PEFT configs + "qwen25_vl_3b_peft_config", + "qwen25_vl_7b_peft_config", + "qwen25_vl_32b_peft_config", + "qwen25_vl_72b_peft_config", + # Qwen3-VL SFT configs + "qwen3_vl_8b_sft_config", + "qwen3_vl_30b_a3b_sft_config", + "qwen3_vl_235b_a22b_sft_config", + # Qwen3-VL PEFT configs + "qwen3_vl_8b_peft_config", + "qwen3_vl_30b_a3b_peft_config", + "qwen3_vl_235b_a22b_peft_config", ] diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py index a7277f256d..8ee637f59b 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py @@ -12,362 +12,916 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import List, Optional, Union +"""Qwen2.5-VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Qwen2.5-VL models (3B, 7B, 32B, 72B). +""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Qwen25VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Qwen2.5-VL recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: 
int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def qwen25_vl_3b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 3B Instruct. +from megatron.bridge.training.config import ConfigContainer - Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=1, PP=1, LR=5e-6 - See `_qwen25_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen2.5-VL 3B SFT Configuration +# ============================================================================= +def qwen25_vl_3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen2.5-VL 3B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Qwen25VLCommonKwargs = { - "hf_path": "Qwen/Qwen2.5-VL-3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Qwen25VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen25_vl_common(**combined_kwargs) + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-3B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 
+ cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" -def qwen25_vl_7b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 7B Instruct. 
+ return cfg - Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=2, PP=1, LR=5e-6 - See `_qwen25_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen2.5-VL 7B SFT Configuration +# ============================================================================= +def qwen25_vl_7b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen2.5-VL 7B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Qwen25VLCommonKwargs = { - "hf_path": "Qwen/Qwen2.5-VL-7B-Instruct", - "tensor_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Qwen25VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen25_vl_common(**combined_kwargs) + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-7B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + 
 cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 -def qwen25_vl_32b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 32B Instruct. 
def qwen25_vl_32b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen2.5-VL 32B Instruct.

    Default configuration: 2 nodes, 16 GPUs total
    - TP=8, PP=2
    - LR=5e-6 (full SFT)
    - Sequence length: 4096

    Returns:
        ConfigContainer: Complete configuration for full-parameter SFT.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-32B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 8
    cfg.model.pipeline_model_parallel_size = 2
    cfg.model.pipeline_dtype = torch.bfloat16
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - lower LR for full SFT.
    # BUGFIX: min_lr was 3e-5, which is *greater* than max_lr (5e-6), so the
    # cosine schedule would anneal upward. Use max_lr / 10 instead.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=5e-6,
        min_lr=5e-7,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
def qwen25_vl_72b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen2.5-VL 72B Instruct.

    Default configuration: 4 nodes, 32 GPUs total
    - TP=8, PP=4
    - LR=5e-6 (full SFT)
    - Sequence length: 4096

    Returns:
        ConfigContainer: Complete configuration for full-parameter SFT.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-72B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 8
    cfg.model.pipeline_model_parallel_size = 4
    cfg.model.pipeline_dtype = torch.bfloat16
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - lower LR for full SFT.
    # BUGFIX: min_lr was 3e-5, which is *greater* than max_lr (5e-6), so the
    # cosine schedule would anneal upward. Use max_lr / 10 instead.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=5e-6,
        min_lr=5e-7,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: + hf_path = "Qwen/Qwen2.5-VL-3B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = 
False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = 
"bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Qwen2.5-VL 7B PEFT Configuration +# ============================================================================= +def qwen25_vl_7b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen2.5-VL 7B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - Create a fine-tuning configuration for Qwen2.5-VL models using a given HuggingFace path. 
def qwen25_vl_7b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen2.5-VL 7B Instruct.

    Default configuration: 1 node, 8 GPUs
    - TP=1, PP=1
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme. BUGFIX: an unrecognized string (e.g. "none") used to be
    # silently assigned to cfg.peft as-is, breaking training later; fail fast.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() not in ("lora", "dora"):
            raise ValueError(
                f"Unsupported peft_scheme {peft_scheme!r}; expected 'lora', 'dora', or a PEFT instance."
            )
        cfg.peft = default_peft_config(peft_scheme)
    else:
        cfg.peft = peft_scheme

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-7B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower TP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - higher LR for PEFT
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
""" - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - effective_lr = finetune_lr if finetune_lr is not None else lr - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-32B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP/PP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + 
cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + 
cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. - _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - 
def qwen25_vl_72b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen2.5-VL 72B Instruct.

    Default configuration: 1 node, 8 GPUs
    - TP=1, PP=1
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme. BUGFIX: an unrecognized string (e.g. "none") used to be
    # silently assigned to cfg.peft as-is, breaking training later; fail fast.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() not in ("lora", "dora"):
            raise ValueError(
                f"Unsupported peft_scheme {peft_scheme!r}; expected 'lora', 'dora', or a PEFT instance."
            )
        cfg.peft = default_peft_config(peft_scheme)
    else:
        cfg.peft = peft_scheme

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-72B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower TP/PP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - higher LR for PEFT
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
+ +This module provides SFT and PEFT configurations for Qwen3-VL MoE models (8B, 30B-A3B, 235B-A22B). +""" import torch -from transformers import AutoTokenizer, Qwen3VLProcessor -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - EnergonProvider, - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder -from megatron.bridge.recipes.utils.finetune_utils import default_peft_config as _default_peft_config +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm +from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) +from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed -class Qwen3VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Qwen3 VL MoE recipe helpers.""" +# ============================================================================= +# Qwen3-VL 8B SFT Configuration +# ============================================================================= +def qwen3_vl_8b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 8B (dense model). 
def qwen3_vl_8b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen3-VL 8B (dense model).

    Default configuration: 1 node, 8 GPUs
    - TP=2, PP=1
    - LR=5e-5 (full SFT)
    - Sequence length: 4096

    NOTE: train_iters/eval_iters defaults here are short (50/10), suitable for
    smoke runs; increase them for a real training job.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen3-VL-8B"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 2
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # Token dispatcher settings (8B is dense, not MoE)
    cfg.model.moe_token_dispatcher_type = None
    cfg.model.moe_flex_dispatcher_backend = None
    cfg.model.moe_hybridep_num_sms = 16

    # Apply flex dispatcher backend (no-op for this non-MoE model)
    apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None)

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # MoE kernel selections (not applicable for dense 8B model)
    cfg.model.moe_router_fusion = False
    cfg.model.moe_permute_fusion = False
    cfg.model.moe_grouped_gemm = False

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # MoE overlap / balancing / FP8 padding (not applicable for dense model)
    cfg.model.moe_shared_expert_overlap = False
    cfg.model.moe_router_force_load_balancing = False
    cfg.model.moe_router_padding_for_fp8 = False

    # Training config (short smoke-run defaults)
    cfg.train.train_iters = 50
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 10

    # Optimizer - lower LR for full SFT (5e-5 -> 5e-6 cosine decay over 50 iters)
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=10,
        lr_decay_iters=50,
        max_lr=5e-5,
        min_lr=5e-6,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # Comm overlap settings (MoE) - disabled for dense model
    cfg.comm_overlap = None
    # cfg.comm_overlap.delay_wgrad_compute = False
    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
- """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-8B-Instruct", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_30b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Qwen3-VL-30B-A3B-Instruct. - - See `_qwen3_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen3-VL 30B-A3B SFT Configuration +# ============================================================================= +def qwen3_vl_30b_a3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 30B-A3B (MoE model). + + Default configuration: 4 nodes, 32 GPUs + - TP=1, PP=1, EP=8 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-30B-A3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_235b_a22b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Qwen3-VL-235B-A22B-Instruct. - - See `_qwen3_vl_common` for the full list of parameters. 
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3-VL-30B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 8 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + 
cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# 
============================================================================= +# Qwen3-VL 235B-A22B SFT Configuration +# ============================================================================= +def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 235B-A22B (MoE model). + + Default configuration: 64 nodes, 512 GPUs + - TP=4, PP=1, EP=32 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-235B-A22B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 8, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "account_for_embedding_in_pipeline_split": True, - "account_for_loss_in_pipeline_split": True, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_8b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL 8B Instruct. 
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3-VL-235B-A22B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 32 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + 
cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# 
============================================================================= +# Qwen3-VL 8B PEFT Configuration +# ============================================================================= +def qwen3_vl_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3-VL 8B (dense model). Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=4, PP=1, LR=1e-5 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_qwen3_vl_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-8B-Instruct", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 1e-5 if is_full_sft else 1e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 1e-6, - "lr": 1e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_30b_a3b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL-30B-A3B-Instruct. - - This is a Mixture-of-Experts model with 128 experts and top-8 routing. - Recommended to use with expert parallelism (EP) for efficient training. 
+ cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3-VL-8B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (not MoE for 8B) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (will be no-op for non-MoE model) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections (not applicable for dense 8B model) + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = False + cfg.model.moe_grouped_gemm = False + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap (not applicable for dense model) + 
cfg.model.moe_shared_expert_overlap = False + + # MoE force balance (not applicable for dense model) + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding (not applicable for dense model) + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # 
cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Qwen3-VL 30B-A3B PEFT Configuration +# ============================================================================= +def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3-VL 30B-A3B (MoE model). Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, EP=8, LR=2e-4 - - Full SFT: TP=1, PP=1, EP=8, LR=2e-5 + - TP=1, PP=1, EP=4 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_qwen3_vl_common` for the full list of parameters. - """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-30B-A3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "peft": peft_value, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 2e-6, - "lr": 2e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_235b_a22b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL-235B-A22B-Instruct. 
- - This is a Mixture-of-Experts model with 128 experts and top-8 routing. - Recommended to use with expert parallelism (EP) for efficient training. - - Default configuration: 4 nodes, 32 GPUs total - - LoRA/DoRA: TP=1, PP=1, EP=8, LR=2e-4 - - Full SFT: TP=4, PP=1, EP=8, LR=2e-5 - - See `_qwen3_vl_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-235B-A22B-Instruct", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "expert_tensor_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 2e-6, - "lr": 2e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def _qwen3_vl_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int 
= 4, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: Optional[int] = 4, - expert_tensor_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - enable_recompute: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - moe_flex_dispatcher_backend: Optional[str] = None, - # Freeze options - pretrained_checkpoint: Optional[str] = None, - freeze_language_model: bool = True, - freeze_vision_model: bool = True, - freeze_vision_projection: bool = False, + hf_path = "Qwen/Qwen3-VL-30B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower EP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 4 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + 
cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
# =============================================================================
# Qwen3-VL 235B-A22B PEFT Configuration
# =============================================================================
def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen3-VL 235B-A22B (MoE model).

    Default configuration: 8 nodes, 64 GPUs
    - TP=1, PP=1, EP=16
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Returns:
        ConfigContainer: Fully populated PEFT fine-tuning configuration.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme: resolve known string schemes; accept PEFT instances as-is.
    # Reject unknown strings explicitly instead of silently storing a raw
    # string in cfg.peft, which would only fail much later at training time.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() in ("lora", "dora"):
            cfg.peft = default_peft_config(peft_scheme)
        else:
            raise ValueError(
                f"Unsupported peft_scheme '{peft_scheme}'. Expected 'lora', 'dora', or a PEFT instance."
            )
    else:
        cfg.peft = peft_scheme

    # Model configuration
    hf_path = "Qwen/Qwen3-VL-235B-A22B"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower EP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.expert_model_parallel_size = 16
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: PEFT adapters handle freezing, so all submodules
    # stay trainable here.
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # Token dispatcher settings (MoE)
    cfg.model.moe_token_dispatcher_type = None
    cfg.model.moe_flex_dispatcher_backend = None
    cfg.model.moe_hybridep_num_sms = 16

    # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch)
    apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None)

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # MoE kernel selections
    cfg.model.moe_router_fusion = False
    cfg.model.moe_permute_fusion = True
    cfg.model.moe_grouped_gemm = True

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # MoE overlap
    cfg.model.moe_shared_expert_overlap = False

    # MoE force balance
    cfg.model.moe_router_force_load_balancing = False

    # MoE FP8 padding
    cfg.model.moe_router_padding_for_fp8 = False

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - PEFT learning rate (1e-4); decay spans the full run, so the
    # decay horizon is tied to train_iters rather than a duplicated constant.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=cfg.train.train_iters,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (disabled by default for full precision)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # Comm overlap settings (MoE)
    cfg.comm_overlap = None
    # cfg.comm_overlap.delay_wgrad_compute = False
    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
a/tests/functional_tests/recipes/test_glm_45v_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_glm_45v_recipes_finetune.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Functional smoke tests for Ministral 3 recipe configurations.""" - -from functools import partial +"""Functional smoke tests for GLM-4.5V recipe configurations.""" import pytest from megatron.bridge.recipes.glm_vl.glm_45v import ( - glm_45v_finetune_config, + glm_45v_sft_config, ) from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -27,8 +25,8 @@ GLM_45V_FINETUNE_RECIPES = [ # Small model, only use 2 layers for quick functional test ( - partial(glm_45v_finetune_config, peft=None), - "glm_45v", + glm_45v_sft_config, + "glm_45v_sft", { "tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, @@ -46,8 +44,8 @@ GLM_45V_FINETUNE_PACKED_RECIPES = [ # Small model with packed sequences, only use 2 layers ( - partial(glm_45v_finetune_config, peft=None), - "glm_45v_packed", + glm_45v_sft_config, + "glm_45v_sft_packed", { "tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, diff --git a/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py b/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py index fa9dedb4c0..8274d50dc2 100644 --- a/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py @@ -14,12 +14,10 @@ """Functional smoke tests for Ministral 3 recipe configurations.""" -from functools import partial - import pytest from megatron.bridge.recipes.ministral3.ministral3 import ( - ministral3_3b_finetune_config, + ministral3_3b_sft_config, ) from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -27,8 +25,8 @@ MINISTRAL3_FINETUNE_RECIPES = [ # Small model, only use 2 layers for quick functional test ( - 
partial(ministral3_3b_finetune_config, peft=None), - "ministral3_3b", + ministral3_3b_sft_config, + "ministral3_3b_sft", {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, "num_layers": 2}, ), ] @@ -36,8 +34,8 @@ MINISTRAL3_FINETUNE_PACKED_RECIPES = [ # Small model with packed sequences, only use 2 layers ( - partial(ministral3_3b_finetune_config, peft=None), - "ministral3_3b_packed", + ministral3_3b_sft_config, + "ministral3_3b_sft_packed", {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, "num_layers": 2}, {"pack_sequences_in_batch": True}, ), diff --git a/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py index 55b76dbf96..1ce299315d 100644 --- a/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py @@ -14,30 +14,20 @@ """Functional smoke tests for Nemotron Nano V2 VL recipe configurations.""" -import functools - import pytest -from megatron.bridge.recipes.nemotron_vl import nemotron_nano_v2_vl as nemotron_recipe +from megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl import ( + nemotron_nano_v2_vl_12b_sft_config, +) from megatron.bridge.training import llava_step from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test -def _finetune_wrapper(**kwargs): - """Wrapper to adapt Nemotron VL finetune_config to the test runner signature. - - The runner will pass (dir, name, dataset_type=mock) among others; we forward - everything to finetune_config and inject a dummy pretrained_checkpoint. 
- """ - kwargs.setdefault("pretrained_checkpoint", "/tmp/fake_nemotron_vl_ckpt") - return nemotron_recipe.nemotron_nano_v2_vl_12b_finetune_config(**kwargs) - - NEMOTRON_VL_FINETUNE_RECIPES = [ # Small model, only use 2 layers ( - functools.partial(_finetune_wrapper, hf_model_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), - "nemotron_vl_nano_v2", + nemotron_nano_v2_vl_12b_sft_config, + "nemotron_vl_nano_v2_sft", { "num_layers": 3, "hybrid_override_pattern": "M*-", diff --git a/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py index f652bebe12..08f5f02c18 100644 --- a/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py @@ -27,7 +27,7 @@ import pytest -from megatron.bridge.recipes.qwen_vl.qwen3_vl import qwen3_vl_8b_finetune_config +from megatron.bridge.recipes.qwen_vl.qwen3_vl import qwen3_vl_8b_sft_config from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -36,14 +36,14 @@ # Qwen3-VL 8B finetune - uses TP=2 for 2-GPU CI # Note: deepstack_visual_indexes must have len <= num_layers ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 4, "deepstack_visual_indexes": [0, 1, 2]}, ), ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", { "tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1, @@ -60,8 +60,8 @@ }, ), ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", { "tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1, @@ -77,8 +77,8 @@ # (config_func, recipe_name, parallelism_overrides, model_overrides, dataset_overrides) # Qwen3-VL 8B finetune with packed sequences ( - qwen3_vl_8b_finetune_config, - 
"qwen3_vl_8b_finetune_packed", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft_packed", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 4, "deepstack_visual_indexes": [0, 1, 2]}, {"pack_sequences_in_batch": True}, diff --git a/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py index 3dd20a924c..3462ccd98f 100644 --- a/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py @@ -16,15 +16,15 @@ import pytest -from megatron.bridge.recipes.qwen_vl.qwen25_vl import qwen25_vl_3b_finetune_config +from megatron.bridge.recipes.qwen_vl.qwen25_vl import qwen25_vl_3b_sft_config from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test QWEN_VL_PRETRAIN_RECIPES = [ - # (config_func, name, parallelism_overrides) + # (config_func, name, parallelism_overrides, model_overrides) # Two-GPU TP for local/CI multi-GPU runs ( - qwen25_vl_3b_finetune_config, + qwen25_vl_3b_sft_config, "qwen25_vl_3b", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 2}, @@ -35,7 +35,7 @@ # (config_func, name, parallelism_overrides, model_overrides, dataset_overrides) # Two-GPU TP with packed sequences ( - qwen25_vl_3b_finetune_config, + qwen25_vl_3b_sft_config, "qwen25_vl_3b_packed", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 2}, diff --git a/tests/functional_tests/recipes/utils.py b/tests/functional_tests/recipes/utils.py index 97dbc6fae9..d62e8b203b 100644 --- a/tests/functional_tests/recipes/utils.py +++ b/tests/functional_tests/recipes/utils.py @@ -213,7 +213,8 @@ def run_pretrain_vl_recipe_test( megatron.bridge.training.vlm_step.forward_step. 
Args: - config_func: The recipe's pretrain_config function + config_func: The recipe's config function (parameterless API for SFT, + or takes peft_scheme parameter for PEFT) recipe_name: Name of the recipe for logging/debugging tmp_path: Temporary directory for test outputs tensor_model_parallel_size: Override tensor parallelism (None = use recipe default) @@ -221,6 +222,8 @@ def run_pretrain_vl_recipe_test( model_overrides: Optional mapping of model attribute overrides to apply dataset_overrides: Optional mapping of dataset attribute overrides to apply """ + from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider + if forward_step_func is None: # Import locally to avoid loading VLM stack for non-VL tests from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step @@ -228,13 +231,20 @@ def run_pretrain_vl_recipe_test( vlm_forward_step = forward_step_func initialize_distributed() - shared_base_dir = broadcast_path(tmp_path) + shared_base_dir = Path(broadcast_path(tmp_path)) try: - # Note: qwen_vl recipe config functions do not support 'mock' kwarg - config: ConfigContainer = config_func( - dir=str(shared_base_dir), name=f"{recipe_name}_functional_test", dataset_type="mock" - ) + # VLM recipe configs use parameterless API - call without arguments + config: ConfigContainer = config_func() + + # Set up output directories after instantiation + run_output_dir = shared_base_dir / f"{recipe_name}_functional_test" + checkpoint_dir = run_output_dir / "checkpoints" + tensorboard_dir = run_output_dir / "tb_logs" + config.checkpoint.save = str(checkpoint_dir) + config.checkpoint.load = str(checkpoint_dir) + config.logger.tensorboard_dir = str(tensorboard_dir) + # Keep runs short and consistent across tests config.train.train_iters = 10 config.validation.eval_interval = 5 @@ -245,31 +255,26 @@ def run_pretrain_vl_recipe_test( config.scheduler.lr_warmup_iters = 1 test_seq_length = 1024 config.model.seq_length = test_seq_length - 
config.dataset.seq_length = test_seq_length - # Disable pin-memory and worker persistence in tests to avoid - # pin-memory device mismatches under torchrun+pytest environments. - config.dataset.pin_memory = False - config.dataset.num_workers = 0 - config.dataset.persistent_workers = False - - train_samples_needed = config.train.train_iters * config.train.global_batch_size - eval_samples_needed = config.validation.eval_iters * config.train.global_batch_size - test_samples_needed = 8 - - total_samples = train_samples_needed + eval_samples_needed + test_samples_needed - - # Set dataset split ratios for minimal dataset - train_split = train_samples_needed / total_samples - valid_split = eval_samples_needed / total_samples - test_split = test_samples_needed / total_samples - - config.dataset.split = [train_split, valid_split, test_split] + # Get the HF processor path from the original dataset config before replacing + hf_processor_path = getattr(config.dataset, "hf_processor_path", None) + pack_sequences_in_batch = getattr(config.dataset, "pack_sequences_in_batch", False) + + # Replace the real dataset with a mock dataset provider for tests + # MockVLMConversationProvider generates synthetic data and doesn't need a split attribute + # since the DatasetBuildContext calculates sample counts from training configuration + config.dataset = MockVLMConversationProvider( + seq_length=test_seq_length, + hf_processor_path=hf_processor_path, + pack_sequences_in_batch=pack_sequences_in_batch, + ) if tensor_model_parallel_size is not None: - config.model.tensor_model_parallel_size = tensor_model_parallel_size + if hasattr(config.model, "tensor_model_parallel_size"): + config.model.tensor_model_parallel_size = tensor_model_parallel_size if pipeline_model_parallel_size is not None: - config.model.pipeline_model_parallel_size = pipeline_model_parallel_size + if hasattr(config.model, "pipeline_model_parallel_size"): + config.model.pipeline_model_parallel_size = 
pipeline_model_parallel_size # Apply any model-specific overrides provided by the caller if model_overrides: @@ -281,7 +286,7 @@ def run_pretrain_vl_recipe_test( for attribute_name, attribute_value in dataset_overrides.items(): setattr(config.dataset, attribute_name, attribute_value) - if config.dataset.pack_sequences_in_batch: + if hasattr(config.dataset, "pack_sequences_in_batch") and config.dataset.pack_sequences_in_batch: config.train.micro_batch_size = 2 pretrain(config, vlm_forward_step) diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py new file mode 100644 index 0000000000..1bd925b4d0 --- /dev/null +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py @@ -0,0 +1,457 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Test purpose: +# - Parametrize over all exported Qwen2.5-VL recipe functions in `megatron.bridge.recipes.qwen_vl.qwen25_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. 
#

import importlib
from typing import Callable

import pytest
import torch


_qwen25_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen25_vl")

# SFT recipe entry points (take no arguments).
_QWEN25_VL_SFT_FUNCS = [
    _qwen25_vl_module.qwen25_vl_3b_sft_config,
    _qwen25_vl_module.qwen25_vl_7b_sft_config,
    _qwen25_vl_module.qwen25_vl_32b_sft_config,
    _qwen25_vl_module.qwen25_vl_72b_sft_config,
]

# PEFT recipe entry points (accept a ``peft_scheme`` argument).
_QWEN25_VL_PEFT_FUNCS = [
    _qwen25_vl_module.qwen25_vl_3b_peft_config,
    _qwen25_vl_module.qwen25_vl_7b_peft_config,
    _qwen25_vl_module.qwen25_vl_32b_peft_config,
    _qwen25_vl_module.qwen25_vl_72b_peft_config,
]


class _FakeModelCfg:
    """Lightweight stand-in for the Megatron model provider config."""

    def __init__(self):
        # Defaults mirror a single-GPU layout; recipes overwrite as needed.
        self.tensor_model_parallel_size = 1
        self.pipeline_model_parallel_size = 1
        self.pipeline_dtype = None
        self.virtual_pipeline_model_parallel_size = None
        self.context_parallel_size = 1
        self.sequence_parallel = False
        self.seq_length = 64
        self.freeze_language_model = False
        self.freeze_vision_model = False
        self.freeze_vision_projection = False

    def finalize(self):
        # No-op; the real provider performs validation here.
        return None


class _FakeAutoBridge:
    """Stand-in for AutoBridge that never touches the network or disk."""

    @staticmethod
    def from_hf_pretrained(hf_path: str):
        """Pretend to load a HF checkpoint; return a fake bridge."""
        return _FakeAutoBridge()

    def to_megatron_provider(self, load_weights: bool = False):
        """Hand back a fake model config instead of a real provider."""
        return _FakeModelCfg()


def _assert_basic_config(config):
    """Verify every mandatory section of a ConfigContainer is populated."""
    from megatron.bridge.training.config import ConfigContainer

    assert isinstance(config, ConfigContainer)
    assert config.model is not None
    assert config.train is not None
    assert config.optimizer is not None
    assert config.scheduler is not None
    assert config.dataset is not None
    assert config.logger is not None
    assert config.tokenizer is not None
    assert config.checkpoint is not None
    assert config.rng is not None

    assert config.train.global_batch_size >= 1
    assert config.train.micro_batch_size >= 1
    assert config.dataset.seq_length >= 1


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_SFT_FUNCS)
def test_each_qwen25_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch):
    """Every Qwen2.5-VL SFT recipe should produce a valid config without PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func()

    _assert_basic_config(config)

    if hasattr(config, "tokenizer") and hasattr(config.tokenizer, "tokenizer_type"):
        assert config.tokenizer.tokenizer_type == "NullTokenizer"

    assert getattr(config.model, "tensor_model_parallel_size", 1) >= 1
    assert getattr(config.model, "pipeline_model_parallel_size", 1) >= 1

    assert hasattr(config.model, "freeze_language_model")
    assert hasattr(config.model, "freeze_vision_model")
    assert hasattr(config.model, "freeze_vision_projection")

    assert config.peft is None


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_PEFT_FUNCS)
def test_each_qwen25_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch):
    """Every Qwen2.5-VL PEFT recipe should produce a valid config with PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func()

    _assert_basic_config(config)

    if hasattr(config, "tokenizer") and hasattr(config.tokenizer, "tokenizer_type"):
        assert config.tokenizer.tokenizer_type == "NullTokenizer"

    assert getattr(config.model, "tensor_model_parallel_size", 1) >= 1
    assert getattr(config.model, "pipeline_model_parallel_size", 1) >= 1

    assert hasattr(config.model, "freeze_language_model")
    assert hasattr(config.model, "freeze_vision_model")
    assert hasattr(config.model, "freeze_vision_projection")

    assert config.peft is not None
    assert hasattr(config.peft, "dim")
    assert hasattr(config.peft, "alpha")


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_PEFT_FUNCS)
@pytest.mark.parametrize("peft_scheme", ["lora", "dora"])
def test_qwen25_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch):
    """Both supported PEFT schemes should be accepted by every PEFT recipe."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func(peft_scheme=peft_scheme)

    _assert_basic_config(config)

    assert config.peft is not None
    assert hasattr(config.peft, "dim")
    assert hasattr(config.peft, "alpha")


def test_qwen25_vl_3b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B SFT defaults: TP=1, PP=1, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is None


def test_qwen25_vl_3b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B LoRA defaults: TP=1, PP=1, dim=32, alpha=32."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config(peft_scheme="lora")

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1

    assert config.peft is not None
    assert config.peft.dim == 32
    assert config.peft.alpha == 32


def test_qwen25_vl_3b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B DoRA defaults: TP=1, PP=1, dim=32, alpha=64."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config(peft_scheme="dora")

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1

    assert config.peft is not None
    assert config.peft.dim == 32
    assert config.peft.alpha == 64


def test_qwen25_vl_7b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """7B SFT defaults: TP=2, PP=1, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_7b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 2
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is None


def test_qwen25_vl_7b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """7B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_7b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_32b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """32B SFT defaults: TP=8, PP=2, bf16 pipeline dtype, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_32b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 8
    assert config.model.pipeline_model_parallel_size == 2
    assert config.model.pipeline_dtype == torch.bfloat16
    assert config.peft is None


def test_qwen25_vl_32b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """32B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_32b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_72b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """72B SFT defaults: TP=8, PP=4, bf16 pipeline dtype, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_72b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 8
    assert config.model.pipeline_model_parallel_size == 4
    assert config.model.pipeline_dtype == torch.bfloat16
    assert config.peft is None


def test_qwen25_vl_72b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """72B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_72b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch):
    """SFT configs should default to the HFDatasetConversationProvider."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider

    assert isinstance(config.dataset, HFDatasetConversationProvider)


def test_qwen25_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch):
    """PEFT configs should default to the HFDatasetConversationProvider."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config()

    from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider

    assert isinstance(config.dataset, HFDatasetConversationProvider)


def test_qwen25_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch):
    """SFT configs should leave all freeze options off by default."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    assert config.model.freeze_language_model is False
    assert config.model.freeze_vision_model is False
    assert config.model.freeze_vision_projection is False


def test_qwen25_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch):
    """PEFT configs should leave all freeze options off by default."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config()

    assert config.model.freeze_language_model is False
    assert config.model.freeze_vision_model is False
    assert config.model.freeze_vision_projection is False
freeze options set to False by default.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_peft_config() + + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen25_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.mixed_precision == "bf16_mixed" + + +def test_qwen25_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + + +def test_qwen25_vl_optimizer_precision_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that optimizer precision settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.optimizer.use_precision_aware_optimizer is False + assert cfg.optimizer.main_grads_dtype == torch.float32 + assert cfg.optimizer.main_params_dtype == torch.float32 + assert cfg.optimizer.exp_avg_dtype == torch.float32 + assert cfg.optimizer.exp_avg_sq_dtype == torch.float32 + + +def test_qwen25_vl_training_config(monkeypatch: pytest.MonkeyPatch): + """Test that training configuration is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + 
_assert_basic_config(cfg) + + assert cfg.train.train_iters == 300000 + assert cfg.train.global_batch_size == 32 + assert cfg.train.micro_batch_size == 2 + assert cfg.train.manual_gc is True + assert cfg.train.manual_gc_interval == 100 + + +def test_qwen25_vl_validation_config(monkeypatch: pytest.MonkeyPatch): + """Test that validation configuration is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.validation.eval_interval == 500 + assert cfg.validation.eval_iters == 32 + + +def test_qwen25_vl_sft_learning_rate(monkeypatch: pytest.MonkeyPatch): + """Test that SFT has lower learning rate than PEFT.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + sft_cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + peft_cfg = _qwen25_vl_module.qwen25_vl_3b_peft_config() + + # SFT should have lower LR (5e-6) compared to PEFT (1e-4) + assert sft_cfg.optimizer.lr < peft_cfg.optimizer.lr + + +def test_qwen25_vl_kernel_settings(monkeypatch: pytest.MonkeyPatch): + """Test that kernel settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.attention_backend == "auto" + assert cfg.model.cross_entropy_loss_fusion is True + assert cfg.model.cross_entropy_fusion_impl == "native" + + +def test_qwen25_vl_cuda_graph_settings(monkeypatch: pytest.MonkeyPatch): + """Test that CUDA graph settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.cuda_graph_impl == "none" + assert cfg.model.cuda_graph_scope == "full" + assert cfg.model.cuda_graph_warmup_steps == 3 + + +def test_qwen25_vl_transformer_impl(monkeypatch: 
pytest.MonkeyPatch): + """Test that transformer implementation is set correctly.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.transformer_impl == "transformer_engine" + + +def test_qwen25_vl_memory_saving_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that memory saving settings are disabled by default.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.recompute_granularity is None + assert cfg.model.recompute_modules is None + assert cfg.model.fine_grained_activation_offloading is False + assert cfg.model.offload_modules is None diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py index 1ba00b910f..0bf6698ec6 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py @@ -12,88 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. +# +# Test purpose: +# - Parametrize over all exported Qwen3-VL recipe functions in `megatron.bridge.recipes.qwen_vl.qwen3_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. +# - Test MoE-specific settings for Qwen3-VL MoE models. 
+# + import importlib -from typing import Callable, List +from typing import Callable import pytest _qwen3_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen3_vl") +# SFT configs (parameterless) +_QWEN3_VL_SFT_FUNCS = [ + _qwen3_vl_module.qwen3_vl_8b_sft_config, + _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config, + _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config, +] -def _collect_recipe_functions(mod) -> List[Callable]: - # Prefer explicit exports - exported_names = getattr(mod, "__all__", None) - candidates: List[Callable] = [] - - if exported_names: - for name in exported_names: - fn = getattr(mod, name, None) - if callable(fn) and (name.endswith("_config") or "qwen3" in name.lower() or "qwen" in name.lower()): - candidates.append(fn) - else: - # Fallback: discover by convention - for name in dir(mod): - if name.startswith("_"): - continue - fn = getattr(mod, name, None) - if callable(fn) and name.endswith("_config"): - candidates.append(fn) - - # De-dupe while preserving order - seen = set() - unique = [] - for fn in candidates: - if fn.__name__ not in seen: - unique.append(fn) - seen.add(fn.__name__) - return unique - - -_QWEN3_VL_RECIPE_FUNCS: List[Callable] = _collect_recipe_functions(_qwen3_vl_module) - - -def _safe_overrides_for(name: str) -> dict: - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 5, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "mock": True, - "lr": 1e-4, - "use_null_tokenizer": True, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - return overrides +# PEFT configs (take peft_scheme parameter) +_QWEN3_VL_PEFT_FUNCS = [ + _qwen3_vl_module.qwen3_vl_8b_peft_config, + _qwen3_vl_module.qwen3_vl_30b_a3b_peft_config, + _qwen3_vl_module.qwen3_vl_235b_a22b_peft_config, +] class _FakeModelCfg: + """Fake model configuration for testing.""" + def __init__(self): - self.cross_entropy_fusion_impl = "te" + # Set 
default attributes that recipes might set + self.tensor_model_parallel_size = 1 + self.pipeline_model_parallel_size = 1 + self.pipeline_dtype = None + self.virtual_pipeline_model_parallel_size = None + self.context_parallel_size = 1 + self.expert_model_parallel_size = 1 + self.sequence_parallel = False + self.seq_length = 64 + self.freeze_language_model = False + self.freeze_vision_model = False + self.freeze_vision_projection = False + # MoE-specific + self.moe_token_dispatcher_type = None + self.moe_flex_dispatcher_backend = None + self.moe_hybridep_num_sms = None + self.moe_router_fusion = False + self.moe_permute_fusion = False + self.moe_grouped_gemm = False + self.moe_router_padding_for_fp8 = False + self.moe_shared_expert_overlap = False + self.moe_router_force_load_balancing = False def finalize(self): return None -class _FakeBridge: - def __init__(self): - pass +class _FakeAutoBridge: + """Fake AutoBridge for testing.""" + + @staticmethod + def from_hf_pretrained(hf_path: str): + """Mock from_hf_pretrained method.""" + return _FakeAutoBridge() def to_megatron_provider(self, load_weights: bool = False): + """Return a fake model config.""" return _FakeModelCfg() - @staticmethod - def from_hf_pretrained(hf_path: str, **kwargs): - return _FakeBridge() - def _assert_basic_config(cfg): + """Assert that a config has all required components.""" from megatron.bridge.training.config import ConfigContainer assert isinstance(cfg, ConfigContainer) @@ -109,30 +105,342 @@ def _assert_basic_config(cfg): assert cfg.train.global_batch_size >= 1 assert cfg.train.micro_batch_size >= 1 + assert cfg.dataset.seq_length >= 1 + + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_SFT_FUNCS) +def test_each_qwen3_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3-VL SFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_qwen3_vl_module, 
"AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() - # Different dataset configs may expose length as sequence_length or seq_length; - # for multimodal datasets there may be no such attribute. Only assert presence when available. - if hasattr(cfg.dataset, "sequence_length"): - assert cfg.dataset.sequence_length >= 1 - elif hasattr(cfg.dataset, "seq_length"): - assert cfg.dataset.seq_length >= 1 - else: - assert cfg.dataset is not None + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 -@pytest.mark.parametrize("recipe_func", _QWEN3_VL_RECIPE_FUNCS) -def test_each_qwen3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - # Monkeypatch AutoBridge used inside the recipe module to avoid heavyweight init - module_name = recipe_func.__module__ - mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - overrides = _safe_overrides_for(recipe_func.__name__) + # SFT configs should not have PEFT + assert cfg.peft is None - cfg = recipe_func(**overrides) + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_PEFT_FUNCS) +def test_each_qwen3_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3-VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() # Default peft_scheme="lora" 
_assert_basic_config(cfg) - # Minimal sanity checks on parallelism fields being set to sane values + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") + + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_qwen3_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Qwen3-VL models.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func(peft_scheme=peft_scheme) + + _assert_basic_config(cfg) + + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") + + +def test_qwen3_vl_8b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B SFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 8B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + +def test_qwen3_vl_8b_peft_lora_defaults(monkeypatch: 
pytest.MonkeyPatch): + """Test that 8B LoRA has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config(peft_scheme="lora") + + _assert_basic_config(cfg) + + # For LoRA, 8B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 32 + + +def test_qwen3_vl_8b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B DoRA has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config(peft_scheme="dora") + + _assert_basic_config(cfg) + + # For DoRA, 8B should use same parallelism as LoRA + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config (DoRA has alpha=64 by default, unlike LoRA's alpha=32) + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 64 + + +def test_qwen3_vl_30b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 30B-A3B SFT has correct default parallelism and MoE settings.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 30B-A3B should use TP=1, PP=1, EP=8 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 8 + + +def test_qwen3_vl_30b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 30B-A3B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + 
monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_peft_config() + + _assert_basic_config(cfg) + + # For LoRA, 30B-A3B should use TP=1, PP=1, EP=4 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 4 + + # Check PEFT config + assert cfg.peft is not None + + +def test_qwen3_vl_235b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 235B-A22B SFT has correct default parallelism and MoE settings.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 235B-A22B should use TP=4, PP=1, EP=32 + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 32 + + +def test_qwen3_vl_235b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 235B-A22B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_peft_config() + + _assert_basic_config(cfg) + + # For LoRA, 235B-A22B should use TP=1, PP=1, EP=16 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 16 + + # Check PEFT config + assert cfg.peft is not None + + +def test_qwen3_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + from 
megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_qwen3_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config() + + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_qwen3_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen3_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config() + + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen3_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # Default should be bf16_mixed + assert 
cfg.mixed_precision == "bf16_mixed" + + +def test_qwen3_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + + +def test_qwen3_vl_moe_settings_30b(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured for 30B-A3B model.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config() + + _assert_basic_config(cfg) + + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + +def test_qwen3_vl_moe_settings_235b(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured for 235B-A22B model.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config() + + _assert_basic_config(cfg) + + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert 
hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + +def test_qwen3_vl_8b_is_dense_model(monkeypatch: pytest.MonkeyPatch): + """Test that 8B is a dense model without MoE-specific parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # 8B should be dense model with EP=1 + assert cfg.model.expert_model_parallel_size == 1 + + # Verify dense model kernel settings + assert cfg.model.moe_router_fusion is False + assert cfg.model.moe_permute_fusion is False + assert cfg.model.moe_grouped_gemm is False diff --git a/tests/unit_tests/recipes/test_gemma3_vl_recipes.py b/tests/unit_tests/recipes/test_gemma3_vl_recipes.py index bbe29bb3ce..d9707a634a 100644 --- a/tests/unit_tests/recipes/test_gemma3_vl_recipes.py +++ b/tests/unit_tests/recipes/test_gemma3_vl_recipes.py @@ -16,7 +16,7 @@ # Test purpose: # - Parametrize over all exported Gemma3-VL recipe functions in `megatron.bridge.recipes.gemma3_vl.gemma3_vl`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. 
# @@ -28,42 +28,23 @@ _gemma3_vl_module = importlib.import_module("megatron.bridge.recipes.gemma3_vl.gemma3_vl") -_GEMMA3_VL_RECIPE_FUNCS = [ - _gemma3_vl_module.gemma3_vl_4b_finetune_config, - _gemma3_vl_module.gemma3_vl_12b_finetune_config, - _gemma3_vl_module.gemma3_vl_27b_finetune_config, + +# SFT configs (parameterless) +_GEMMA3_VL_SFT_FUNCS = [ + _gemma3_vl_module.gemma3_vl_4b_sft_config, + _gemma3_vl_module.gemma3_vl_12b_sft_config, + _gemma3_vl_module.gemma3_vl_27b_sft_config, ] +# PEFT configs (take peft_scheme parameter) +_GEMMA3_VL_PEFT_FUNCS = [ + _gemma3_vl_module.gemma3_vl_4b_peft_config, + _gemma3_vl_module.gemma3_vl_12b_peft_config, + _gemma3_vl_module.gemma3_vl_27b_peft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - # Large models/variants may set additional flags in recipes; keep harmless defaults - lname = name.lower() - if "12b" in lname or "27b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": True, - } - ) - - return overrides +# All recipe functions +_GEMMA3_VL_ALL_FUNCS = _GEMMA3_VL_SFT_FUNCS + _GEMMA3_VL_PEFT_FUNCS class _FakeModelCfg: @@ -119,15 +100,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_RECIPE_FUNCS) -def test_each_gemma3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each Gemma3-VL recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_SFT_FUNCS) +def 
test_each_gemma3_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Gemma3-VL SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -144,115 +123,78 @@ def test_each_gemma3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_gemma3_vl_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["dataset_type"] = dataset_type - - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) - - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - 
assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_PEFT_FUNCS) +def test_each_gemma3_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Gemma3-VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + cfg = recipe_func() # Default peft_scheme="lora" -def test_gemma3_vl_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + _assert_basic_config(cfg) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def 
test_gemma3_vl_27b_pipeline_dtype(monkeypatch: pytest.MonkeyPatch): - """Test that 27B model sets pipeline_dtype correctly for full SFT.""" +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_gemma3_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Gemma3-VL models.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = None # Full SFT - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) - - # The 27B model should set pipeline_dtype to bfloat16 for full SFT - assert cfg.model.pipeline_dtype == torch.bfloat16 + cfg = recipe_func(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_GEMMA3_VL_FINETUNE_FUNCS = [ - _gemma3_vl_module.gemma3_vl_4b_finetune_config, - _gemma3_vl_module.gemma3_vl_12b_finetune_config, - _gemma3_vl_module.gemma3_vl_27b_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_FINETUNE_FUNCS) -@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_gemma3_vl_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for Gemma3-VL models.""" +def test_gemma3_vl_4b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B SFT has correct default parallelism and learning rate.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = 
_gemma3_vl_module.gemma3_vl_4b_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, 4B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None -def test_gemma3_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B LoRA has correct default parallelism and learning rate.""" +def test_gemma3_vl_4b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Don't override finetune_lr to test default - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config(peft_scheme="lora") _assert_basic_config(cfg) @@ -265,22 +207,13 @@ def test_gemma3_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_gemma3_vl_4b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B DoRA has correct default parallelism and learning rate.""" +def test_gemma3_vl_4b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = 
_safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -294,42 +227,27 @@ def test_gemma3_vl_4b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_gemma3_vl_4b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B full SFT has correct default parallelism and learning rate.""" +def test_gemma3_vl_12b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_12b_sft_config() _assert_basic_config(cfg) - # For full SFT, 4B should use TP=1, PP=1 - assert cfg.model.tensor_model_parallel_size == 1 + # For full SFT, 12B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 - -def test_gemma3_vl_12b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 12B LoRA has correct default parallelism.""" +def test_gemma3_vl_12b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, 
"AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_12b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_12b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_12b_peft_config() _assert_basic_config(cfg) @@ -341,39 +259,30 @@ def test_gemma3_vl_12b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_gemma3_vl_12b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 12B full SFT has correct default parallelism.""" +def test_gemma3_vl_27b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 27B SFT has correct default parallelism and pipeline_dtype.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_12b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_12b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_sft_config() _assert_basic_config(cfg) - # For full SFT, 12B should use TP=4, PP=1 - assert cfg.model.tensor_model_parallel_size == 4 - assert cfg.model.pipeline_model_parallel_size == 1 + # For full SFT, 27B should use TP=8, PP=2 + assert cfg.model.tensor_model_parallel_size == 8 + assert cfg.model.pipeline_model_parallel_size == 2 assert cfg.peft is None + # For full SFT, pipeline_dtype should be set to bfloat16 + assert cfg.model.pipeline_dtype == torch.bfloat16 + -def test_gemma3_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 27B LoRA has correct default parallelism.""" +def test_gemma3_vl_27b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 27B 
PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_peft_config() _assert_basic_config(cfg) @@ -388,43 +297,12 @@ def test_gemma3_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.model.pipeline_dtype is None -def test_gemma3_vl_27b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 27B full SFT has correct default parallelism.""" - - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # For full SFT, 27B should use TP=8, PP=2 - assert cfg.model.tensor_model_parallel_size == 8 - assert cfg.model.pipeline_model_parallel_size == 2 - assert cfg.peft is None - - # For full SFT, pipeline_dtype should be set to bfloat16 - assert cfg.model.pipeline_dtype == torch.bfloat16 - - -def test_gemma3_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): +def test_gemma3_vl_27b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): """Test that 27B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test 
recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -439,42 +317,80 @@ def test_gemma3_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.model.pipeline_dtype is None -def test_gemma3_vl_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" +def test_gemma3_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - _assert_basic_config(cfg) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 +def test_gemma3_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config() -def test_gemma3_vl_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def 
test_gemma3_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False - _assert_basic_config(cfg) - # Check PEFT config - assert cfg.peft is not None +def test_gemma3_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config() - # Check freeze options - assert cfg.model.freeze_language_model is True + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True + assert cfg.model.freeze_vision_projection is False + + +def test_gemma3_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() + + _assert_basic_config(cfg) + + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" + + +def test_gemma3_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is 
correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() + + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True diff --git a/tests/unit_tests/recipes/test_glm_45v_recipes.py b/tests/unit_tests/recipes/test_glm_45v_recipes.py index d60d6636c2..c459289aaf 100644 --- a/tests/unit_tests/recipes/test_glm_45v_recipes.py +++ b/tests/unit_tests/recipes/test_glm_45v_recipes.py @@ -16,9 +16,9 @@ # Test purpose: # - Parametrize over all exported GLM-4.5V recipe functions in `megatron.bridge.recipes.glm_vl.glm_45v`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. -# - Test pipeline model parallel layout for asymmetric stages. +# - Test MoE-specific settings for this MoE VLM model. 
# import importlib @@ -28,33 +28,19 @@ _glm_45v_module = importlib.import_module("megatron.bridge.recipes.glm_vl.glm_45v") -_GLM_45V_RECIPE_FUNCS = [ - _glm_45v_module.glm_45v_finetune_config, + +# SFT configs (parameterless) +_GLM_45V_SFT_FUNCS = [ + _glm_45v_module.glm_45v_sft_config, ] +# PEFT configs (take peft_scheme parameter) +_GLM_45V_PEFT_FUNCS = [ + _glm_45v_module.glm_45v_peft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "context_parallel_size": 1, - "sequence_parallel": False, - "virtual_pipeline_model_parallel_size": None, - } - - return overrides +# All recipe functions +_GLM_45V_ALL_FUNCS = _GLM_45V_SFT_FUNCS + _GLM_45V_PEFT_FUNCS class _FakeModelCfg: @@ -66,19 +52,23 @@ def __init__(self): self.pipeline_model_parallel_size = 1 self.pipeline_dtype = None self.virtual_pipeline_model_parallel_size = None - self.expert_model_parallel_size = 1 self.context_parallel_size = 1 + self.expert_model_parallel_size = 1 self.sequence_parallel = False self.seq_length = 64 self.freeze_language_model = False self.freeze_vision_model = False self.freeze_vision_projection = False - # Pipeline layout attributes - self.pipeline_model_parallel_layout = None - self.account_for_embedding_in_pipeline_split = True - self.account_for_loss_in_pipeline_split = True - self.num_layers_in_first_pipeline_stage = None - self.num_layers_in_last_pipeline_stage = None + # MoE-specific + self.moe_token_dispatcher_type = None + self.moe_flex_dispatcher_backend = None + self.moe_hybridep_num_sms = None + self.moe_router_fusion = False + self.moe_permute_fusion = False 
+ self.moe_grouped_gemm = False + self.moe_router_padding_for_fp8 = False + self.moe_shared_expert_overlap = False + self.moe_router_force_load_balancing = False def finalize(self): return None @@ -117,15 +107,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _GLM_45V_RECIPE_FUNCS) -def test_each_glm_45v_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each GLM-4.5V recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _GLM_45V_SFT_FUNCS) +def test_each_glm_45v_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each GLM-4.5V SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -136,156 +124,111 @@ def test_each_glm_45v_recipe_builds_config(recipe_func: Callable, monkeypatch: p # Verify parallelism settings assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - assert getattr(cfg.model, "expert_model_parallel_size", 1) >= 1 # Verify freeze settings are set assert hasattr(cfg.model, "freeze_language_model") assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_glm_45v_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - - overrides = 
_safe_overrides_for("glm_45v_finetune_config") - overrides["dataset_type"] = dataset_type - - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider +@pytest.mark.parametrize("recipe_func", _GLM_45V_PEFT_FUNCS) +def test_each_glm_45v_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each GLM-4.5V PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + cfg = recipe_func() # Default peft_scheme="lora" + _assert_basic_config(cfg) -def test_glm_45v_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - overrides = _safe_overrides_for("glm_45v_finetune_config") - 
overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def test_glm_45v_invalid_dataset_type(monkeypatch: pytest.MonkeyPatch): - """Test that invalid dataset_type raises ValueError.""" +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_glm_45v_peft_schemes(peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for GLM-4.5V model.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["dataset_type"] = "invalid_type" - - with pytest.raises(ValueError, match="Unsupported dataset_type"): - _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_GLM_45V_FINETUNE_FUNCS = [ - _glm_45v_module.glm_45v_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _GLM_45V_FINETUNE_FUNCS) 
-@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_glm_45v_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for GLM-4.5V models.""" +def test_glm_45v_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V SFT has correct default parallelism and MoE settings.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, GLM-4.5V should use TP=1, PP=8, EP=16 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 8 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 16 -def test_glm_45v_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that GLM-4.5V LoRA has correct default parallelism and learning rate.""" +def test_glm_45v_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = 
_glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme="lora") _assert_basic_config(cfg) - # For LoRA, GLM-4.5V should use TP=1, PP=8, EP=4 + # For LoRA, GLM-4.5V should use TP=1, PP=8 assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 4 # Check PEFT config assert cfg.peft is not None assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_glm_45v_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that GLM-4.5V DoRA has correct default parallelism and learning rate.""" +def test_glm_45v_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "dora" - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme="dora") _assert_basic_config(cfg) - # For DoRA, GLM-4.5V should use same parallelism as LoRA + # For DoRA, should use same parallelism as LoRA (TP=1, PP=8) assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 4 # Check PEFT config (DoRA has alpha=64 by default, unlike LoRA's alpha=32) assert cfg.peft is not None @@ -293,216 +236,54 @@ def test_glm_45v_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_glm_45v_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 
GLM-4.5V full SFT has correct default parallelism and learning rate.""" +def test_glm_45v_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = None - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) + cfg = _glm_45v_module.glm_45v_sft_config() - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # For full SFT, GLM-4.5V should use TP=1, PP=8, EP=16 - assert cfg.model.tensor_model_parallel_size == 1 - assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 16 - assert cfg.peft is None + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 + assert isinstance(cfg.dataset, HFDatasetConversationProvider) -def test_glm_45v_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" +def test_glm_45v_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _glm_45v_module.glm_45v_peft_config() - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) 
+ from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 + assert isinstance(cfg.dataset, HFDatasetConversationProvider) -def test_glm_45v_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" +def test_glm_45v_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() - _assert_basic_config(cfg) - - # Check PEFT config - assert cfg.peft is not None - - # Check freeze options - assert cfg.model.freeze_language_model is True + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True - - -# Pipeline layout tests - - -def test_glm_45v_pipeline_layout_pp4(): - """Test pipeline layout for PP=4.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 4 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=4 should have 4 stages - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 4 - # First stage: embedding + 11 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def 
test_glm_45v_pipeline_layout_pp8(): - """Test pipeline layout for PP=8.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 8 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=8 should have 8 stages (full SFT layout: embedding+1, then 7*6, then 3+loss) - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 8 - # First stage: embedding + 1 decoder layer - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 1 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp16(): - """Test pipeline layout for PP=16.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 16 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=16 should have 16 stages (full SFT layout: embedding alone, then 3*14, then 3+loss) - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 16 - # First stage: embedding only (no decoder layers, to balance vision encoder cost) - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 0 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp8_peft(): - """Test pipeline layout for PP=8 with PEFT.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 8 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, is_peft=True) - - # PP=8 PEFT layout: embedding+5, then 6*6, then 5+loss - assert 
model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 8 - # First stage: embedding + 5 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 5 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp16_peft(): - """Test pipeline layout for PP=16 with PEFT.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 16 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, is_peft=True) - - # PP=16 PEFT layout: embedding+2, then 3*14, then 2+loss - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 16 - # First stage: embedding + 2 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 2 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_custom(): - """Test that custom pipeline layout overrides defaults.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 2 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - custom_layout = [["embedding"] + ["decoder"] * 20, ["decoder"] * 26 + ["loss"]] - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, layout=custom_layout) - - # Custom layout should be used - assert model_cfg.pipeline_model_parallel_layout == custom_layout - - -def test_glm_45v_pipeline_layout_in_config(monkeypatch: pytest.MonkeyPatch): - """Test that pipeline layout is correctly set in the full config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - - overrides = 
_safe_overrides_for("glm_45v_finetune_config") - overrides["pipeline_model_parallel_size"] = 8 - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # Check that pipeline layout is set - assert cfg.model.pipeline_model_parallel_layout is not None - # Check that asymmetric pipeline split settings are disabled - assert cfg.model.account_for_embedding_in_pipeline_split is False - assert cfg.model.account_for_loss_in_pipeline_split is False + assert cfg.model.freeze_vision_projection is False -def test_glm_45v_wandb_logging(monkeypatch: pytest.MonkeyPatch): - """Test that W&B logging options are correctly passed.""" +def test_glm_45v_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["wandb_project"] = "test_project" - overrides["wandb_entity"] = "test_entity" - overrides["wandb_exp_name"] = "test_exp" - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) + cfg = _glm_45v_module.glm_45v_peft_config() - assert cfg.logger.wandb_project == "test_project" - assert cfg.logger.wandb_entity == "test_entity" - assert cfg.logger.wandb_exp_name == "test_exp" + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): @@ -510,9 +291,7 @@ def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = 
_glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) @@ -520,56 +299,64 @@ def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): assert cfg.mixed_precision == "bf16_mixed" -def test_glm_45v_peft_none_string(monkeypatch: pytest.MonkeyPatch): - """Test that peft='none' (string) is treated as full SFT.""" +def test_glm_45v_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "none" - # Remove parallelism overrides to test recipe defaults - overrides.pop("expert_model_parallel_size", None) - overrides.pop("finetune_lr", None) - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # peft="none" should be treated as full SFT - assert cfg.peft is None - # Should use full SFT defaults: EP=16, LR=5e-6 - assert cfg.model.expert_model_parallel_size == 16 - assert cfg.optimizer.lr == 5e-6 + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True -def test_glm_45v_ddp_config(monkeypatch: pytest.MonkeyPatch): - """Test that DDP config is correctly set.""" +def test_glm_45v_moe_settings(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # Check DDP settings - assert cfg.ddp.check_for_nan_in_grad is True - assert cfg.ddp.grad_reduce_in_fp32 is True - assert 
cfg.ddp.use_distributed_optimizer is True - assert cfg.ddp.data_parallel_sharding_strategy == "optim_grads_params" + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + # Verify default MoE kernel settings + assert cfg.model.moe_router_fusion is False + assert cfg.model.moe_permute_fusion is True + assert cfg.model.moe_grouped_gemm is True -def test_glm_45v_megatron_fsdp(monkeypatch: pytest.MonkeyPatch): - """Test that Megatron FSDP option is correctly passed.""" +def test_glm_45v_pipeline_layout_function_exists(): + """Test that pipeline layout function is exported.""" + assert hasattr(_glm_45v_module, "set_glm_45v_pipeline_model_parallel_layout") + assert callable(_glm_45v_module.set_glm_45v_pipeline_model_parallel_layout) + + +def test_glm_45v_sft_uses_pipeline_layout(monkeypatch: pytest.MonkeyPatch): + """Test that SFT config has pipeline model parallel layout set.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["use_megatron_fsdp"] = True - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - assert cfg.ddp.use_megatron_fsdp is True + # PP should be set when pipeline layout is used + assert cfg.model.pipeline_model_parallel_size >= 1 + # Check if pipeline_model_parallel_layout is set + assert hasattr(cfg.model, "pipeline_model_parallel_layout") diff --git 
a/tests/unit_tests/recipes/test_ministral3_recipes.py b/tests/unit_tests/recipes/test_ministral3_recipes.py index 836d87f41b..1b4fc878a8 100644 --- a/tests/unit_tests/recipes/test_ministral3_recipes.py +++ b/tests/unit_tests/recipes/test_ministral3_recipes.py @@ -16,7 +16,7 @@ # Test purpose: # - Parametrize over all exported Ministral3 recipe functions in `megatron.bridge.recipes.ministral3.ministral3`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. # @@ -27,42 +27,20 @@ _ministral3_module = importlib.import_module("megatron.bridge.recipes.ministral3.ministral3") -_MINISTRAL3_RECIPE_FUNCS = [ - _ministral3_module.ministral3_3b_finetune_config, - _ministral3_module.ministral3_8b_finetune_config, - _ministral3_module.ministral3_14b_finetune_config, -] +# SFT configs (parameterless) +_MINISTRAL3_SFT_FUNCS = [ + _ministral3_module.ministral3_3b_sft_config, + _ministral3_module.ministral3_8b_sft_config, + _ministral3_module.ministral3_14b_sft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - # Large models may set additional flags in recipes; keep harmless defaults - lname = name.lower() - if "8b" in lname or "14b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": False, - } - ) - - return overrides +# PEFT configs (take peft_scheme 
parameter) +_MINISTRAL3_PEFT_FUNCS = [ + _ministral3_module.ministral3_3b_peft_config, + _ministral3_module.ministral3_8b_peft_config, + _ministral3_module.ministral3_14b_peft_config, +] class _FakeModelCfg: @@ -118,15 +96,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _MINISTRAL3_RECIPE_FUNCS) -def test_each_ministral3_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each Ministral3 recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_SFT_FUNCS) +def test_each_ministral3_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Ministral3 SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -143,101 +119,78 @@ def test_each_ministral3_recipe_builds_config(recipe_func: Callable, monkeypatch assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_ministral3_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge + +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_PEFT_FUNCS) +def test_each_ministral3_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Ministral3 PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_ministral3_module, "AutoBridge", 
_FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["dataset_type"] = dataset_type + cfg = recipe_func() # Default peft_scheme="lora" - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def test_ministral3_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model 
config.""" +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_ministral3_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Ministral3 models.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) - - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + cfg = recipe_func(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_MINISTRAL3_FINETUNE_FUNCS = [ - _ministral3_module.ministral3_3b_finetune_config, - _ministral3_module.ministral3_8b_finetune_config, - _ministral3_module.ministral3_14b_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _MINISTRAL3_FINETUNE_FUNCS) -@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_ministral3_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for Ministral3 models.""" +def test_ministral3_3b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = 
_ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, 3B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None -def test_ministral3_3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B LoRA has correct default parallelism and learning rate.""" +def test_ministral3_3b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_peft_config(peft_scheme="lora") _assert_basic_config(cfg) @@ -250,22 +203,13 @@ def test_ministral3_3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_ministral3_3b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B DoRA has correct default parallelism and learning rate.""" +def test_ministral3_3b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", 
_FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -279,44 +223,27 @@ def test_ministral3_3b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_ministral3_3b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B full SFT has correct default parallelism and learning rate.""" +def test_ministral3_8b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_8b_sft_config() _assert_basic_config(cfg) - # For full SFT, 3B should use TP=1, PP=1 - assert cfg.model.tensor_model_parallel_size == 1 + # For full SFT, 8B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 - -def test_ministral3_8b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 8B LoRA has correct default parallelism.""" +def test_ministral3_8b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 
8B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_8b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_8b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_8b_peft_config() _assert_basic_config(cfg) @@ -328,39 +255,27 @@ def test_ministral3_8b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_ministral3_8b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 8B full SFT has correct default parallelism.""" +def test_ministral3_14b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_8b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_8b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_sft_config() _assert_basic_config(cfg) - # For full SFT, 8B should use TP=2, PP=1 - assert cfg.model.tensor_model_parallel_size == 2 + # For full SFT, 14B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None -def test_ministral3_14b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B LoRA has correct default parallelism.""" +def test_ministral3_14b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge 
monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_peft_config() _assert_basic_config(cfg) @@ -372,86 +287,85 @@ def test_ministral3_14b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_ministral3_14b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B full SFT has correct default parallelism.""" +def test_ministral3_14b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) - # For full SFT, 14B should use TP=4, PP=1 - assert cfg.model.tensor_model_parallel_size == 4 + # For DoRA, 14B should use same parallelism as LoRA (TP=2, PP=1) + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is None + # Check PEFT config + assert cfg.peft is not None -def test_ministral3_14b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B DoRA has correct default parallelism.""" + +def test_ministral3_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use 
HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) + cfg = _ministral3_module.ministral3_3b_sft_config() - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - _assert_basic_config(cfg) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # For DoRA, 14B should use same parallelism as LoRA (TP=2, PP=1) - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - # Check PEFT config - assert cfg.peft is not None +def test_ministral3_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) + cfg = _ministral3_module.ministral3_3b_peft_config() -def test_ministral3_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_ministral3_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _ministral3_module.ministral3_3b_sft_config() - 
cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_ministral3_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) + + cfg = _ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" -def test_ministral3_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" +def test_ministral3_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check PEFT config - assert cfg.peft is not None - - # Check freeze options - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True + # Check DDP settings + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + assert cfg.ddp.grad_reduce_in_fp32 is True diff --git 
a/tests/unit_tests/recipes/test_nemotron_vl_recipes.py b/tests/unit_tests/recipes/test_nemotron_vl_recipes.py index f7fe85f512..82e1bd1a01 100644 --- a/tests/unit_tests/recipes/test_nemotron_vl_recipes.py +++ b/tests/unit_tests/recipes/test_nemotron_vl_recipes.py @@ -12,39 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. +# +# Test purpose: +# - Parametrize over all exported Nemotron VL recipe functions in `megatron.bridge.recipes.nemotron_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. +# + import importlib +from typing import Callable import pytest -import torch -_nemotron_module = importlib.import_module("megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl") +_nemotron_vl_module = importlib.import_module("megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl") +# SFT configs (parameterless) +_NEMOTRON_VL_SFT_FUNCS = [ + _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config, +] -def _safe_overrides() -> dict: - """Create safe test overrides for Nemotron VL recipe functions.""" - return { - "name": "unit_nemotron_vl", - "dir": ".", - "hf_model_path": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_parallelism": 1, - "pipeline_parallelism": 1, - "context_parallelism": 1, - "sequence_parallelism": False, - } +# PEFT configs (take peft_scheme parameter) +_NEMOTRON_VL_PEFT_FUNCS = [ + _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config, +] class _FakeModelCfg: """Fake model configuration for testing.""" def __init__(self): + # Set default attributes that recipes might set self.tensor_model_parallel_size = 1 self.pipeline_model_parallel_size = 1 self.pipeline_dtype = 
None @@ -61,17 +60,20 @@ def finalize(self): class _FakeAutoBridge: - """Fake AutoBridge for testing to avoid HF downloads and I/O.""" + """Fake AutoBridge for testing.""" @staticmethod - def from_hf_pretrained(hf_path: str, *args, **kwargs): + def from_hf_pretrained(hf_path: str, **kwargs): + """Mock from_hf_pretrained method.""" return _FakeAutoBridge() def to_megatron_provider(self, load_weights: bool = False): + """Return a fake model config.""" return _FakeModelCfg() def _assert_basic_config(cfg): + """Assert that a config has all required components.""" from megatron.bridge.training.config import ConfigContainer assert isinstance(cfg, ConfigContainer) @@ -90,104 +92,212 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -def test_nemotron_vl_pretrain_builds_config(monkeypatch: pytest.MonkeyPatch): - """Test that pretrain_config builds a valid configuration and sets basic fields.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) +@pytest.mark.parametrize("recipe_func", _NEMOTRON_VL_SFT_FUNCS) +def test_each_nemotron_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Nemotron VL SFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() + + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # SFT configs should 
not have PEFT + assert cfg.peft is None + + +@pytest.mark.parametrize("recipe_func", _NEMOTRON_VL_PEFT_FUNCS) +def test_each_nemotron_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Nemotron VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() # Default peft_scheme="lora" + + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # PEFT configs should have PEFT configured + assert cfg.peft is not None + + +def test_nemotron_vl_12b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B SFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 12B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + +def test_nemotron_vl_12b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_pretrain_config(**overrides) + cfg 
= _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() _assert_basic_config(cfg) - # Dataset provider should be HF-based - from megatron.bridge.data.vlm_datasets import HFDatasetConversationProvider + # For PEFT, 12B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config (uses VLMLoRA) + assert cfg.peft is not None + + +def test_nemotron_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # Null tokenizer is used - assert getattr(cfg.tokenizer, "tokenizer_type", None) == "NullTokenizer" - # Parallelism settings should be wired into model cfg - assert getattr(cfg.model, "tensor_model_parallel_size", 0) == overrides["tensor_parallelism"] - assert getattr(cfg.model, "pipeline_model_parallel_size", 0) == overrides["pipeline_parallelism"] - assert getattr(cfg.model, "context_parallel_size", 0) == overrides["context_parallelism"] - assert getattr(cfg.model, "sequence_parallel", None) is overrides["sequence_parallelism"] - assert getattr(cfg.model, "seq_length", 0) == overrides["seq_length"] +def test_nemotron_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -def test_nemotron_vl_pretrain_pipeline_dtype(monkeypatch: 
pytest.MonkeyPatch): - """Test that pipeline_parallelism_dtype is respected.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - overrides = _safe_overrides() - overrides["pipeline_parallelism_dtype"] = torch.bfloat16 - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_pretrain_config(**overrides) +def test_nemotron_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - assert getattr(cfg.model, "pipeline_dtype", None) is torch.bfloat16 + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False -def test_nemotron_vl_finetune_with_lora(monkeypatch: pytest.MonkeyPatch): - """Test finetune_config wiring including LoRA when enabled.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - lora_on_language_model=True, - lora_on_vision_model=False, - **overrides, - ) +def test_nemotron_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_nemotron_vl_precision_config(monkeypatch: 
pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() _assert_basic_config(cfg) - # Check that checkpoint wiring includes the pretrained checkpoint - assert getattr(cfg.checkpoint, "pretrained_checkpoint", None) == "/fake/ckpt" + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" - # LoRA should be configured - from megatron.bridge.peft.lora import VLMLoRA - assert isinstance(getattr(cfg, "peft", None), VLMLoRA) +def test_nemotron_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - # Finetune defaults applied (since overrides didn't provide finetune-specific lr) - assert hasattr(cfg.optimizer, "lr") and cfg.optimizer.lr == 5e-5 - assert hasattr(cfg.optimizer, "min_lr") and cfg.optimizer.min_lr == 5e-6 - assert getattr(cfg.model, "tensor_model_parallel_size", None) == 2 + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True -def test_nemotron_vl_finetune_without_lora(monkeypatch: pytest.MonkeyPatch): - """Test finetune_config when LoRA is disabled.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - del overrides["lr"] - del overrides["min_lr"] - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - lora_on_language_model=False, - **overrides, - ) +def test_nemotron_vl_peft_uses_vlm_lora(monkeypatch: pytest.MonkeyPatch): + """Test that 
Nemotron Nano V2 VL uses VLMLoRA for PEFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() _assert_basic_config(cfg) - # No PEFT configured - assert getattr(cfg, "peft", None) is None + # Check PEFT config is present (should be VLMLoRA) + assert cfg.peft is not None - # Finetune defaults applied when not explicitly provided in overrides - assert hasattr(cfg.optimizer, "lr") and cfg.optimizer.lr == 1e-5 - assert hasattr(cfg.optimizer, "min_lr") and cfg.optimizer.min_lr == 1e-6 + # Check PEFT type is VLMLoRA + from megatron.bridge.peft.lora import VLMLoRA + assert isinstance(cfg.peft, VLMLoRA) -def test_nemotron_vl_finetune_custom_save_dir(monkeypatch: pytest.MonkeyPatch): - """Test that save_checkpoint_dir overrides are respected in finetune_config.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - save_checkpoint_dir="/fake/save", - **overrides, - ) +def test_nemotron_vl_sft_training_params(monkeypatch: pytest.MonkeyPatch): + """Test that training parameters are correctly set for SFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + _assert_basic_config(cfg) + + # Check training parameters + assert cfg.train.train_iters == 2000 + assert cfg.train.micro_batch_size == 1 + + +def test_nemotron_vl_peft_training_params(monkeypatch: pytest.MonkeyPatch): + """Test that training parameters are correctly set for PEFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + + _assert_basic_config(cfg) - assert getattr(cfg.checkpoint, "save", None) == 
"/fake/save" - assert getattr(cfg.checkpoint, "load", None) == "/fake/save" + # Check training parameters (should match SFT after update) + assert cfg.train.train_iters == 2000 + assert cfg.train.micro_batch_size == 1