diff --git a/src/megatron/bridge/recipes/common.py b/src/megatron/bridge/recipes/common.py index 8ea8fe0179..c422baea5b 100644 --- a/src/megatron/bridge/recipes/common.py +++ b/src/megatron/bridge/recipes/common.py @@ -16,9 +16,11 @@ from megatron.core.distributed import DistributedDataParallelConfig +from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider from megatron.bridge.peft.lora import LoRA from megatron.bridge.recipes.utils.finetune_utils import default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -335,3 +337,210 @@ def _peft_common() -> ConfigContainer: ) return cfg + + +def _sft_common_vlm() -> ConfigContainer: + """Create a base SFT ConfigContainer with common defaults for Vision-Language Models. + + This function inherits from `_sft_common()` and overrides VLM-specific settings. + The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. + + Key differences from LLM SFT (`_sft_common`): + - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) + - Uses NullTokenizer (VLMs use processor instead of tokenizer) + - DDP config optimized for VLM training (no grad/param overlap) + - Supports freeze options for language_model, vision_model, vision_projection + - Different training defaults (train_iters=300000, GBS=32, MBS=2) + - Different RNG seed (1234) + + Returns: + ConfigContainer: Base configuration template for VLM full SFT. 
+ """ + # Start from the LLM SFT common config + cfg = _sft_common() + + # Default output directories + base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, "default") + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Default sequence length for VLM + seq_length = 4096 + + # VLM-specific training config - longer training with different batch sizes + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # VLM-specific validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # VLM-specific optimizer settings - higher LR for VLM training + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=None, # Defaults to train_iters during validation + max_lr=3e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # VLM-specific DDP config - no overlap for VLMs + cfg.ddp = DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=False, + overlap_param_gather=False, + average_in_collective=True, + data_parallel_sharding_strategy="optim_grads_params", + use_distributed_optimizer=True, + ) + + # VLM-specific dataset - uses HuggingFace dataset provider + # hf_processor_path must be set by model-specific config + cfg.dataset = HFDatasetConversationProvider( + seq_length=seq_length, + hf_processor_path=None, # Must be set by model-specific config + maker_name="make_cord_v2_dataset", + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + pack_sequences_in_batch=True, + ) + + # VLM uses NullTokenizer - actual tokenization is handled by the processor + cfg.tokenizer = 
TokenizerConfig( + tokenizer_type="NullTokenizer", + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, + ) + + # VLM-specific logger config + cfg.logger = LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ) + + # VLM-specific checkpoint config + cfg.checkpoint.save_interval = 500 + cfg.checkpoint.save = checkpoint_dir + cfg.checkpoint.load = checkpoint_dir + cfg.checkpoint.ckpt_format = "torch_dist" + cfg.checkpoint.fully_parallel_save = True + + # VLM uses different RNG seed + cfg.rng = RNGConfig(seed=1234) + + return cfg + + +def _peft_common_vlm() -> ConfigContainer: + """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models. + + This function inherits from `_peft_common()` and overrides VLM-specific settings. + The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. + + Key differences from LLM PEFT (`_peft_common`): + - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) + - Uses NullTokenizer (VLMs use processor instead of tokenizer) + - DDP config optimized for VLM training (no grad/param overlap) + - Supports freeze options for language_model, vision_model, vision_projection + - Different training defaults (train_iters=300000, GBS=32, MBS=2) + - Different RNG seed (1234) + - Higher LR (1e-4) for adapter training + + Returns: + ConfigContainer: Base configuration template for VLM PEFT with LoRA. 
+ """ + # Start from the LLM PEFT common config + cfg = _peft_common() + + # Default output directories + base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, "default") + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Default sequence length for VLM + seq_length = 4096 + + # VLM-specific training config - longer training with different batch sizes + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # VLM-specific validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # VLM-specific optimizer settings - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=None, # Defaults to train_iters during validation + max_lr=1e-4, # Higher LR for adapter training + min_lr=1e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # VLM-specific DDP config - no overlap for VLMs + cfg.ddp = DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=False, + overlap_param_gather=False, + average_in_collective=True, + data_parallel_sharding_strategy="optim_grads_params", + use_distributed_optimizer=True, + ) + + # VLM-specific dataset - uses HuggingFace dataset provider + # hf_processor_path must be set by model-specific config + cfg.dataset = HFDatasetConversationProvider( + seq_length=seq_length, + hf_processor_path=None, # Must be set by model-specific config + maker_name="make_cord_v2_dataset", + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + pack_sequences_in_batch=True, + ) + + # VLM uses NullTokenizer - actual tokenization is handled by the 
processor + cfg.tokenizer = TokenizerConfig( + tokenizer_type="NullTokenizer", + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, + ) + + # VLM-specific logger config + cfg.logger = LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ) + + # VLM-specific checkpoint config + cfg.checkpoint.save_interval = 500 + cfg.checkpoint.save = checkpoint_dir + cfg.checkpoint.load = checkpoint_dir + cfg.checkpoint.ckpt_format = "torch_dist" + cfg.checkpoint.fully_parallel_save = True + + # VLM uses different RNG seed + cfg.rng = RNGConfig(seed=1234) + + # Keep LoRA config from _peft_common() - it's already set with standard defaults + + return cfg diff --git a/src/megatron/bridge/recipes/gemma3_vl/__init__.py b/src/megatron/bridge/recipes/gemma3_vl/__init__.py index de402836e9..f560a6c109 100644 --- a/src/megatron/bridge/recipes/gemma3_vl/__init__.py +++ b/src/megatron/bridge/recipes/gemma3_vl/__init__.py @@ -13,14 +13,20 @@ # limitations under the License. 
from megatron.bridge.recipes.gemma3_vl.gemma3_vl import ( - gemma3_vl_4b_finetune_config, - gemma3_vl_12b_finetune_config, - gemma3_vl_27b_finetune_config, + gemma3_vl_4b_peft_config, + gemma3_vl_4b_sft_config, + gemma3_vl_12b_peft_config, + gemma3_vl_12b_sft_config, + gemma3_vl_27b_peft_config, + gemma3_vl_27b_sft_config, ) __all__ = [ - "gemma3_vl_4b_finetune_config", - "gemma3_vl_12b_finetune_config", - "gemma3_vl_27b_finetune_config", + "gemma3_vl_4b_sft_config", + "gemma3_vl_12b_sft_config", + "gemma3_vl_27b_sft_config", + "gemma3_vl_4b_peft_config", + "gemma3_vl_12b_peft_config", + "gemma3_vl_27b_peft_config", ] diff --git a/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py b/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py index 7e29af13ab..66cc3b7d66 100644 --- a/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py +++ b/src/megatron/bridge/recipes/gemma3_vl/gemma3_vl.py @@ -12,339 +12,679 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import List, Optional, Union +"""Gemma3-VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Gemma3-VL models (4B, 12B, 27B). 
+""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider -from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Gemma3VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Gemma3-VL recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Gemma3-VL 4B SFT Configuration +# ============================================================================= +def gemma3_vl_4b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 4B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-5 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def gemma3_vl_4b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 4B Instruct. 
+ hf_path = "google/gemma-3-4b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + 
cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 12B SFT Configuration +# ============================================================================= +def gemma3_vl_12b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 12B Instruct. 
Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=1, PP=1, LR=5e-6 + - TP=4, PP=1 + - LR=5e-5 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "google/gemma-3-12b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - See `_gemma3_vl_common` for the full list of parameters. 
+ # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint 
+ # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 27B SFT Configuration +# ============================================================================= +def gemma3_vl_27b_sft_config() -> ConfigContainer: + """Return a full SFT config for Gemma3-VL 27B Instruct. + + Default configuration: 2 nodes, 16 GPUs total + - TP=8, PP=2 + - LR=5e-5 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-4b-it", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) + # Model configuration + hf_path = "google/gemma-3-27b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + # Parallel settings + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + 
cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # 
cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg -def gemma3_vl_12b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 12B Instruct. + +# ============================================================================= +# Gemma3-VL 4B PEFT Configuration +# ============================================================================= +def gemma3_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 4B Instruct. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=4, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=2e-4 (PEFT) + - Sequence length: 4096 - See `_gemma3_vl_common` for the full list of parameters. + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _peft_common_vlm() - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-12b-it", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration + hf_path = "google/gemma-3-4b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 -def gemma3_vl_27b_finetune_config(**user_kwargs: Unpack[Gemma3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Gemma3-VL 27B Instruct. 
+ # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False - Default configuration: 2 nodes, 16 GPUs total - - LoRA/DoRA: TP=4, PP=1, LR=1e-4 - - Full SFT: TP=8, PP=2, LR=5e-6 + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = 
torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 - See `_gemma3_vl_common` for the full list of parameters. - """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Gemma3VLCommonKwargs = { - "hf_path": "google/gemma-3-27b-it", - "tensor_model_parallel_size": 8 if is_full_sft else 4, - "pipeline_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_dtype": torch.bfloat16 if is_full_sft else None, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Gemma3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma3_vl_common(**combined_kwargs) - - -def _gemma3_vl_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "gemma3_vl_finetune", - pretrained_checkpoint: Optional[str] = None, # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, - # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: 
Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for Gemma3-VL models using a given HuggingFace path. + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False - The dataset pipeline is based on the Gemma3-VL architecture. To train multimodal tokens, - ensure your preprocessed data includes appropriate image placeholders. + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 12B PEFT Configuration +# ============================================================================= +def gemma3_vl_12b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 12B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 (lower than SFT for PEFT) + - LR=2e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - model_cfg.cp_comm_type = "a2a" - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - effective_lr = finetune_lr if finetune_lr is not None else lr - if min_lr > effective_lr: - min_lr = effective_lr * 0.1 - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "google/gemma-3-12b-it" + cfg.model = 
AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Gemma3-VL 27B PEFT Configuration +# ============================================================================= +def gemma3_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Gemma3-VL 27B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 (lower than SFT for PEFT) + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. 
- _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - random_seed=0, - image_size=(256, 256), - pad_to_max_length=True, - create_attention_mask=True, - num_images=1, - dataloader_type="single", - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError( - f"Unsupported dataset_type '{_dataset_choice}'. Currently only 'mock' is supported for Gemma3-VL." 
- ) - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "google/gemma-3-27b-it" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP and PP for PEFT + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + 
cfg.model.freeze_vision_projection = False + cfg.model.cp_comm_type = "a2a" + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - VLMs require no overlap + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # 
cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/glm_vl/__init__.py b/src/megatron/bridge/recipes/glm_vl/__init__.py index 0e61b8f27c..96edc1c446 100644 --- a/src/megatron/bridge/recipes/glm_vl/__init__.py +++ b/src/megatron/bridge/recipes/glm_vl/__init__.py @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .glm_45v import glm_45v_finetune_config +from .glm_45v import ( + glm_45v_peft_config, + glm_45v_sft_config, + set_glm_45v_pipeline_model_parallel_layout, +) __all__ = [ - "glm_45v_finetune_config", + "glm_45v_sft_config", + "glm_45v_peft_config", + "set_glm_45v_pipeline_model_parallel_layout", ] diff --git a/src/megatron/bridge/recipes/glm_vl/glm_45v.py b/src/megatron/bridge/recipes/glm_vl/glm_45v.py index 5536f6e83e..fb53b5b3dc 100644 --- a/src/megatron/bridge/recipes/glm_vl/glm_45v.py +++ b/src/megatron/bridge/recipes/glm_vl/glm_45v.py @@ -12,36 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +"""GLM-4.5V finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for GLM-4.5V (106B MoE). 
+""" + from typing import List, Optional, Union import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig +from megatron.bridge.training.config import ConfigContainer def set_glm_45v_pipeline_model_parallel_layout( @@ -95,287 +81,296 @@ def set_glm_45v_pipeline_model_parallel_layout( model_cfg.pipeline_model_parallel_layout = layout_map[(pp_size, vp_size)] -class GLM45VCommonKwargs(TypedDict, total=False): - """Typed options accepted by GLM-4.5V recipe helper functions.""" +# ============================================================================= +# GLM-4.5V SFT Configuration +# ============================================================================= +def glm_45v_sft_config() -> ConfigContainer: + """Return a full SFT config for GLM-4.5V (106B MoE). 
- # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - expert_model_parallel_size: int - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # Pipeline layout - layout: Optional[Union[str, List[List[str]]]] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def glm_45v_finetune_config(**user_kwargs: Unpack[GLM45VCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for GLM-4.5V (based on GLM-4.5 Air 106B). - - Default configuration: - - LoRA/DoRA: TP=1, PP=8, EP=4 (64 GPUs, 8 nodes), LR=1e-4 - - Full SFT: TP=1, PP=8, EP=16 (512 GPUs, 64 nodes), LR=5e-6 - - GLM-4.5V is a Vision-Language model with: - - 106B total parameters (based on GLM-4.5 Air) - - Sparse MoE with shared experts - - Multi-modality support for images and videos - - See `_glm_45v_common` for the full list of parameters. 
+ Default configuration: 64 nodes, 512 GPUs + - TP=1, PP=8, EP=16 + - LR=5e-6 (full SFT) + - Sequence length: 8192 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: GLM45VCommonKwargs = { - "hf_path": "zai-org/GLM-4.5V", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 8, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 16 if is_full_sft else 4, - "global_batch_size": 64 if is_full_sft else 32, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: GLM45VCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _glm_45v_common(**combined_kwargs) - - -def _glm_45v_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "glm_45v_finetune", - pretrained_checkpoint: Optional[str] = None, - # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - expert_model_parallel_size: int = 4, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, 
str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # Pipeline layout - layout: Optional[Union[str, List[List[str]]]] = None, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for GLM-4.5V models using a given HuggingFace path. + cfg = _sft_common_vlm() - The dataset pipeline is conversation-based. To train multimodal tokens, ensure your - preprocessed data includes placeholders (e.g., ) as needed. + # Model configuration + hf_path = "zai-org/GLM-4.5V" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 8192 - GLM-4.5V is a Vision-Language model based on GLM-4.5 Air (106B parameters) with: - - Sparse MoE architecture with shared experts - - Multi-modal support for images and videos - - MRoPE (Multi-Resolution Rotary Position Embedding) - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.expert_model_parallel_size 
= expert_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 8 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 16 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False # Set pipeline model parallel layout for asymmetric stages - set_glm_45v_pipeline_model_parallel_layout(model_cfg, layout, is_peft=peft is not None) + set_glm_45v_pipeline_model_parallel_layout(cfg.model, layout=None, is_peft=False) # Pipeline split for asymmetric stages are specified with the layout above - model_cfg.account_for_embedding_in_pipeline_split = False - model_cfg.account_for_loss_in_pipeline_split = False - model_cfg.num_layers_in_first_pipeline_stage = None - model_cfg.num_layers_in_last_pipeline_stage = None - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - # Ensure min_lr does not exceed max_lr (use 10% of effective_lr as default min) - effective_lr = finetune_lr if finetune_lr is not None else lr - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min(min_lr, effective_lr * 0.1), + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # VLM-specific settings + cfg.model.freeze_language_model = False + 
cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = True + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 64 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=5e-6, + min_lr=5e-6 * 0.1, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 8192 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - GLM-4.5V specific settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (uncomment to use FP8) + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. 
- _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) +# ============================================================================= +# GLM-4.5V PEFT Configuration +# ============================================================================= +def glm_45v_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for GLM-4.5V (106B MoE). + + Default configuration: 8 nodes, 64 GPUs + - TP=1, PP=8, EP=4 + - LR=1e-4 (PEFT) + - Sequence length: 8192 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].") - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "zai-org/GLM-4.5V" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 8192 + + # Parallel settings - lower EP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 
8 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 4 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # Set pipeline model parallel layout for asymmetric stages + set_glm_45v_pipeline_model_parallel_layout(cfg.model, layout=None, is_peft=True) + + # Pipeline split for asymmetric stages are specified with the layout above + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = True + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 50 + 
cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=1e-4, + min_lr=1e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 8192 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - GLM-4.5V specific settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (uncomment to use FP8) + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg 
diff --git a/src/megatron/bridge/recipes/ministral3/__init__.py b/src/megatron/bridge/recipes/ministral3/__init__.py index 195375c9bb..0e062c0ec5 100644 --- a/src/megatron/bridge/recipes/ministral3/__init__.py +++ b/src/megatron/bridge/recipes/ministral3/__init__.py @@ -14,15 +14,22 @@ # Ministral3 models from .ministral3 import ( - ministral3_3b_finetune_config, - ministral3_8b_finetune_config, - ministral3_14b_finetune_config, + ministral3_3b_peft_config, + ministral3_3b_sft_config, + ministral3_8b_peft_config, + ministral3_8b_sft_config, + ministral3_14b_peft_config, + ministral3_14b_sft_config, ) __all__ = [ - # Ministral3 models - "ministral3_3b_finetune_config", - "ministral3_8b_finetune_config", - "ministral3_14b_finetune_config", + # Ministral3 SFT configs + "ministral3_3b_sft_config", + "ministral3_8b_sft_config", + "ministral3_14b_sft_config", + # Ministral3 PEFT configs + "ministral3_3b_peft_config", + "ministral3_8b_peft_config", + "ministral3_14b_peft_config", ] diff --git a/src/megatron/bridge/recipes/ministral3/ministral3.py b/src/megatron/bridge/recipes/ministral3/ministral3.py index c52c17ecf7..8d6752b750 100644 --- a/src/megatron/bridge/recipes/ministral3/ministral3.py +++ b/src/megatron/bridge/recipes/ministral3/ministral3.py @@ -12,336 +12,709 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import List, Optional, Union +"""Ministral3 finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Ministral3 models (3B, 8B, 14B). 
+""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Ministral3FinetuneKwargs(TypedDict, total=False): - """Typed options accepted by Ministral3 finetuning recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - seq_length: Optional[int] +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Ministral3 3B SFT Configuration +# ============================================================================= +def ministral3_3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 3B. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: Optional[int] - micro_batch_size: int - eval_interval: int - save_interval: int - # Optimizer - finetune_lr: Optional[float] - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def ministral3_3b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Ministral3 3B. 
+ hf_path = "mistralai/Ministral-3-3B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = 
torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 8B SFT Configuration +# ============================================================================= +def ministral3_8b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 8B. 
Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=1, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=1, PP=1, LR=5e-6 + - TP=2, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "mistralai/Ministral-3-8B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - See `_ministral3_finetune_common` for the full list of parameters. 
+ # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # 
cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 14B SFT Configuration +# ============================================================================= +def ministral3_14b_sft_config() -> ConfigContainer: + """Return a full SFT config for Ministral3 14B. + + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-3B-Instruct-2512", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) + # Model configuration + hf_path = "mistralai/Ministral-3-14B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + 
cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" -def ministral3_8b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning 
config for Ministral3 8B. + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Ministral3 3B PEFT Configuration +# ============================================================================= +def ministral3_3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 3B. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=1, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=2, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_ministral3_finetune_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
""" - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "mistralai/Ministral-3-3B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, 
scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-8B-Instruct-2512", - "tensor_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg -def 
ministral3_14b_finetune_config(**user_kwargs: Unpack[Ministral3FinetuneKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Ministral3 14B. + +# ============================================================================= +# Ministral3 8B PEFT Configuration +# ============================================================================= +def ministral3_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 8B. Default configuration: 1 node, 8 GPUs - - LoRA/DoRA (default): TP=2, PP=1, LR=1e-4 - - Full SFT (peft=None): TP=4, PP=1, LR=5e-6 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_ministral3_finetune_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", "lora") - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Ministral3FinetuneKwargs = { - "hf_path": "mistralai/Ministral-3-14B-Instruct-2512", - "tensor_model_parallel_size": 4 if is_full_sft else 2, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Ministral3FinetuneKwargs = {**recommended_kwargs, **user_kwargs} - return _ministral3_finetune_common(**combined_kwargs) - - -def _ministral3_finetune_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "ministral3_finetune", - pretrained_checkpoint: Optional[str] = None, - # Dataset configuration - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", 
"dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1000, - global_batch_size: int = 32, - micro_batch_size: int = 1, - seq_length: int = 4096, - eval_interval: int = 30, - save_interval: int = 50, - # Optimizer - finetune_lr: Optional[float] = None, - min_lr: float = 0.0, - lr_warmup_iters: int = 50, - lr_decay_iters: Optional[int] = None, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a fine-tuning configuration for Ministral3 family models using a given HuggingFace path. + hf_path = "mistralai/Ministral-3-8B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 - The dataset pipeline is conversation-based. To train multimodal tokens, ensure your - preprocessed data includes placeholders (e.g., ) as needed. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use default - effective_lr = finetune_lr if finetune_lr is not None else 1e-4 - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + 
cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # 
cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy - _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) + return cfg + + +# ============================================================================= +# Ministral3 14B PEFT Configuration +# 
============================================================================= +def ministral3_14b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Ministral3 14B. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].") - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - 
comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "mistralai/Ministral-3-14B-Instruct-2512" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 5 + cfg.validation.eval_iters = 10 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.0002, + min_lr=0.00002, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + 
cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 50 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/nemotron_vl/__init__.py b/src/megatron/bridge/recipes/nemotron_vl/__init__.py index 0de786c5ef..4fe909096e 100644 --- a/src/megatron/bridge/recipes/nemotron_vl/__init__.py +++ b/src/megatron/bridge/recipes/nemotron_vl/__init__.py @@ -13,12 +13,12 @@ # limitations under the License. 
from .nemotron_nano_v2_vl import ( - nemotron_nano_v2_vl_12b_finetune_config, - nemotron_nano_v2_vl_12b_pretrain_config, + nemotron_nano_v2_vl_12b_peft_config, + nemotron_nano_v2_vl_12b_sft_config, ) __all__ = [ - "nemotron_nano_v2_vl_12b_pretrain_config", - "nemotron_nano_v2_vl_12b_finetune_config", + "nemotron_nano_v2_vl_12b_sft_config", + "nemotron_nano_v2_vl_12b_peft_config", ] diff --git a/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py b/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py index c1824a336d..1ae2f13522 100644 --- a/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py +++ b/src/megatron/bridge/recipes/nemotron_vl/nemotron_nano_v2_vl.py @@ -12,240 +12,259 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Optional, Union +"""Nemotron Nano V2 VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Nemotron Nano V2 VL 12B. 
+""" import torch from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, -) -from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider +from megatron.bridge.peft.base import PEFT from megatron.bridge.peft.lora import VLMLoRA +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def nemotron_nano_v2_vl_12b_pretrain_config( - dir: Optional[str] = None, - name: str = "nemotron_nano_v2_vl_pretrain", - hf_model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", - # Dataset configuration - dataset_type: Optional[str] = None, - mock: bool = False, - dataset_maker_name: str = "make_cord_v2_dataset", - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Checkpointing - 
save_interval: Optional[int] = 200, -) -> ConfigContainer: - """ - Create a pre-training configuration for Nemotron Nano V2 VL. +from megatron.bridge.training.config import ConfigContainer + + +# ============================================================================= +# Nemotron Nano V2 VL 12B SFT Configuration +# ============================================================================= +def nemotron_nano_v2_vl_12b_sft_config() -> ConfigContainer: + """Return a full SFT config for Nemotron Nano V2 VL 12B. - Note: Current dataset pipeline is text-centric. To train multimodal tokens, - your preprocessed data should include placeholder tokens (e.g., ) as needed. + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 + - LR=1e-5 (finetune default) + - Sequence length: 4096 """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_model_path, trust_remote_code=True) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_parallelism - model_cfg.pipeline_model_parallel_size = pipeline_parallelism - model_cfg.pipeline_dtype = pipeline_parallelism_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism - model_cfg.context_parallel_size = context_parallelism - model_cfg.sequence_parallel = sequence_parallelism - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + cfg.model = 
AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=True).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 2000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 0 + + # Optimizer - finetune defaults + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=5, + lr_decay_iters=None, + max_lr=2e-5, + min_lr=2e-6, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + 
cfg.optimizer.exp_avg_sq_dtype = torch.float32 - # Dataset provider selection - _dataset_choice = (dataset_type or ("mock" if mock else "hf")).lower() + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - Nemotron uses average_in_collective=False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 200 - if _dataset_choice == "mock": - dataset_cfg = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=hf_model_path, - dataloader_type="single", + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Nemotron Nano V2 VL 12B PEFT Configuration +# ============================================================================= +def nemotron_nano_v2_vl_12b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Nemotron Nano V2 VL 12B. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=5e-5 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ Note: Default uses VLMLoRA targeting all model components. + """ + cfg = _peft_common_vlm() + + # PEFT scheme - Nemotron uses VLMLoRA by default + if isinstance(peft_scheme, str) and peft_scheme.lower() == "lora": + cfg.peft = VLMLoRA( + target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], + dim=16, + alpha=32, ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=hf_model_path, - maker_name=dataset_maker_name, - # Dataloader config parameters - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, + elif isinstance(peft_scheme, str) and peft_scheme.lower() == "dora": + cfg.peft = VLMLoRA( + target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], + dim=16, + alpha=32, + dora=True, ) else: - raise ValueError(f"Unknown dataset_type '{_dataset_choice}'. Expected one of: 'mock', 'hf', 'preloaded'.") - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=500, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=False, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - # use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( 
- save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + cfg.peft = peft_scheme - return cfg + # Model configuration + hf_path = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + cfg.model = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=True).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False -def nemotron_nano_v2_vl_12b_finetune_config( - *, - pretrained_checkpoint: str = "", - lora_on_language_model: bool = False, - lora_on_vision_model: bool = False, - save_checkpoint_dir: Optional[str] = None, - **pretrain_kwargs, -) -> ConfigContainer: - """Create a finetuning configuration for Nemotron Nano V2 VL. - - This helper wraps :func:`nemotron_nano_v2_vl_12b_pretrain_config`, forwarding all keyword arguments to it - while additionally wiring up the :class:`CheckpointConfig` for finetuning from a - given *``pretrained_checkpoint``*. - - Parameters: - pretrained_checkpoint: str - Path to a Megatron-Bridge checkpoint (or a directory produced by - ``convert_ckpt_hf_to_megatron``) that will be loaded before training. - save_checkpoint_dir: str | None, default ``run_output_dir / "checkpoints"`` - Directory where new checkpoints will be saved / resumed from. If not - provided, we reuse the default path chosen by *nemotron_nano_v2_vl_12b_pretrain_config*. - lora_on_language_model: bool = True - Whether to apply PEFT to the language model. - lora_on_vision_model: bool = True - Whether to apply PEFT to the vision model. 
- **pretrain_kwargs: Any - Additional keyword arguments are forwarded verbatim to - :func:`nemotron_nano_v2_vl_12b_pretrain_config` to customise the base recipe (e.g. batch size, - learning rate, parallelism). - """ + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 - cfg = nemotron_nano_v2_vl_12b_pretrain_config(**pretrain_kwargs) - - # Override Train hyper-parameters suitable for finetuning if the caller did - # not explicitly pass them via **pretrain_kwargs. - if pretrain_kwargs.get("train_iters") is None: - cfg.train.train_iters = 10_000 - if pretrain_kwargs.get("lr") is None and hasattr(cfg.optimizer, "lr"): - cfg.optimizer.lr = 1e-5 # type: ignore[attr-defined] - if pretrain_kwargs.get("min_lr") is None and hasattr(cfg.optimizer, "min_lr"): - cfg.optimizer.min_lr = 1e-6 # type: ignore[attr-defined] - - # Update CheckpointConfig for finetuning. 
- ckpt_dir = save_checkpoint_dir or cfg.checkpoint.save or cfg.checkpoint.load # type: ignore[attr-defined] - cfg.checkpoint = CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save=ckpt_dir, - load=ckpt_dir, - ckpt_format=cfg.checkpoint.ckpt_format, # preserve existing choice - fully_parallel_save=cfg.checkpoint.fully_parallel_save, - save_interval=cfg.checkpoint.save_interval, + # Kernel selections + cfg.model.attention_backend = "flash" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 2000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 0 + + # Optimizer - PEFT LR settings + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=5, + lr_decay_iters=None, + max_lr=2e-5, + min_lr=2e-6, ) - if lora_on_language_model: - if lora_on_vision_model: - cfg.peft = VLMLoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, - alpha=32, - ) - else: - cfg.peft = VLMLoRA( - target_modules=[ - "*language_model*.linear_qkv", - "*language_model*.linear_proj", - "*language_model*.linear_fc1", - "*language_model*.linear_fc2", - ], - dim=16, - alpha=32, - freeze_vision_model=False, - freeze_vision_projection=False, - ) - - cfg.optimizer.lr = 5e-5 - cfg.optimizer.min_lr = 5e-6 - cfg.model.tensor_model_parallel_size = 2 + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer 
= False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings - Nemotron uses average_in_collective=False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Checkpoint config - override save_interval from common + cfg.checkpoint.save_interval = 200 + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" return cfg diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py index afb6ac1048..4d89c691c3 100644 --- a/src/megatron/bridge/recipes/qwen_vl/__init__.py +++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py @@ -12,24 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Qwen3 models +# Qwen2.5-VL models +# Qwen3-VL models from .qwen3_vl import ( - qwen3_vl_8b_finetune_config, - qwen3_vl_8b_pretrain_config, - qwen3_vl_30b_a3b_finetune_config, - qwen3_vl_30b_a3b_pretrain_config, - qwen3_vl_235b_a22b_finetune_config, - qwen3_vl_235b_a22b_pretrain_config, + qwen3_vl_8b_peft_config, + qwen3_vl_8b_sft_config, + qwen3_vl_30b_a3b_peft_config, + qwen3_vl_30b_a3b_sft_config, + qwen3_vl_235b_a22b_peft_config, + qwen3_vl_235b_a22b_sft_config, +) +from .qwen25_vl import ( + qwen25_vl_3b_peft_config, + qwen25_vl_3b_sft_config, + qwen25_vl_7b_peft_config, + qwen25_vl_7b_sft_config, + qwen25_vl_32b_peft_config, + qwen25_vl_32b_sft_config, + qwen25_vl_72b_peft_config, + qwen25_vl_72b_sft_config, ) __all__ = [ - # Qwen3-VL pretrain configs - "qwen3_vl_8b_pretrain_config", - "qwen3_vl_30b_a3b_pretrain_config", - "qwen3_vl_235b_a22b_pretrain_config", - # Qwen3-VL finetune configs (with PEFT support) - "qwen3_vl_8b_finetune_config", - "qwen3_vl_30b_a3b_finetune_config", - "qwen3_vl_235b_a22b_finetune_config", + # Qwen2.5-VL SFT configs + "qwen25_vl_3b_sft_config", + "qwen25_vl_7b_sft_config", + "qwen25_vl_32b_sft_config", + "qwen25_vl_72b_sft_config", + # Qwen2.5-VL PEFT configs + "qwen25_vl_3b_peft_config", + "qwen25_vl_7b_peft_config", + "qwen25_vl_32b_peft_config", + "qwen25_vl_72b_peft_config", + # Qwen3-VL SFT configs + "qwen3_vl_8b_sft_config", + "qwen3_vl_30b_a3b_sft_config", + "qwen3_vl_235b_a22b_sft_config", + # Qwen3-VL PEFT configs + "qwen3_vl_8b_peft_config", + "qwen3_vl_30b_a3b_peft_config", + "qwen3_vl_235b_a22b_peft_config", ] diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py index a7277f256d..8ee637f59b 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py @@ -12,362 +12,916 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import List, Optional, Union +"""Qwen2.5-VL finetuning recipes with parameterless API. + +This module provides SFT and PEFT configurations for Qwen2.5-VL models (3B, 7B, 32B, 72B). +""" import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Qwen25VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Qwen2.5-VL recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: 
int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # Checkpoint options - pretrained_checkpoint: Optional[str] - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -def qwen25_vl_3b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 3B Instruct. +from megatron.bridge.training.config import ConfigContainer - Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=1, PP=1, LR=5e-6 - See `_qwen25_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen2.5-VL 3B SFT Configuration +# ============================================================================= +def qwen25_vl_3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen2.5-VL 3B Instruct. 
+ + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Qwen25VLCommonKwargs = { - "hf_path": "Qwen/Qwen2.5-VL-3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Qwen25VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen25_vl_common(**combined_kwargs) + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-3B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 
+ cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" -def qwen25_vl_7b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 7B Instruct. 
+ return cfg - Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=2, PP=1, LR=5e-6 - See `_qwen25_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen2.5-VL 7B SFT Configuration +# ============================================================================= +def qwen25_vl_7b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen2.5-VL 7B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + cfg = _sft_common_vlm() - recommended_kwargs: Qwen25VLCommonKwargs = { - "hf_path": "Qwen/Qwen2.5-VL-7B-Instruct", - "tensor_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - } - combined_kwargs: Qwen25VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen25_vl_common(**combined_kwargs) + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-7B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + 
 cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 -def qwen25_vl_32b_finetune_config(**user_kwargs: Unpack[Qwen25VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen2.5-VL 32B Instruct. 
def qwen25_vl_32b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen2.5-VL 32B Instruct.

    Default configuration: 2 nodes, 16 GPUs total
    - TP=8, PP=2
    - LR=5e-6 (full SFT)
    - Sequence length: 4096

    Returns:
        ConfigContainer: Complete configuration for full-parameter SFT.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-32B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 8
    cfg.model.pipeline_model_parallel_size = 2
    cfg.model.pipeline_dtype = torch.bfloat16
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - lower LR for full SFT.
    # BUGFIX: min_lr was 3e-5, which is *greater* than max_lr (5e-6), so the
    # cosine schedule would anneal upward. Use max_lr / 10 instead.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=5e-6,
        min_lr=5e-7,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
def qwen25_vl_72b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen2.5-VL 72B Instruct.

    Default configuration: 4 nodes, 32 GPUs total
    - TP=8, PP=4
    - LR=5e-6 (full SFT)
    - Sequence length: 4096

    Returns:
        ConfigContainer: Complete configuration for full-parameter SFT.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-72B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 8
    cfg.model.pipeline_model_parallel_size = 4
    cfg.model.pipeline_dtype = torch.bfloat16
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - lower LR for full SFT.
    # BUGFIX: min_lr was 3e-5, which is *greater* than max_lr (5e-6), so the
    # cosine schedule would anneal upward. Use max_lr / 10 instead.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=5e-6,
        min_lr=5e-7,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision and comm overlap - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - freeze_language_model: bool = False, - freeze_vision_model: bool = False, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: + hf_path = "Qwen/Qwen2.5-VL-3B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = 
False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = 
"bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Qwen2.5-VL 7B PEFT Configuration +# ============================================================================= +def qwen25_vl_7b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen2.5-VL 7B Instruct. + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - Create a fine-tuning configuration for Qwen2.5-VL models using a given HuggingFace path. 
def qwen25_vl_7b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen2.5-VL 7B Instruct.

    Default configuration: 1 node, 8 GPUs
    - TP=1, PP=1
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme. BUGFIX: an unrecognized string (e.g. "none") used to be
    # silently assigned to cfg.peft as-is, breaking training later; fail fast.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() not in ("lora", "dora"):
            raise ValueError(
                f"Unsupported peft_scheme {peft_scheme!r}; expected 'lora', 'dora', or a PEFT instance."
            )
        cfg.peft = default_peft_config(peft_scheme)
    else:
        cfg.peft = peft_scheme

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-7B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower TP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - higher LR for PEFT
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
""" - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Build provider via AutoBridge and set parallel/seq params here - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - - # Optimizer and scheduler - use finetune_lr if provided, otherwise use lr - effective_lr = finetune_lr if finetune_lr is not None else lr - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen2.5-VL-32B-Instruct" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP/PP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + 
cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + 
cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg - # PEFT config - peft_config = default_peft_config(peft) - - # Determine dataset selection strategy. - _dataset_choice = dataset_type or "hf" - _processor_model = tokenizer_model or hf_path - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - 
def qwen25_vl_72b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen2.5-VL 72B Instruct.

    Default configuration: 1 node, 8 GPUs
    - TP=1, PP=1
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme. BUGFIX: an unrecognized string (e.g. "none") used to be
    # silently assigned to cfg.peft as-is, breaking training later; fail fast.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() not in ("lora", "dora"):
            raise ValueError(
                f"Unsupported peft_scheme {peft_scheme!r}; expected 'lora', 'dora', or a PEFT instance."
            )
        cfg.peft = default_peft_config(peft_scheme)
    else:
        cfg.peft = peft_scheme

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen2.5-VL-72B-Instruct"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower TP/PP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - higher LR for PEFT
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=300000,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
+ +This module provides SFT and PEFT configurations for Qwen3-VL MoE models (8B, 30B-A3B, 235B-A22B). +""" import torch -from transformers import AutoTokenizer, Qwen3VLProcessor -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - EnergonProvider, - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder -from megatron.bridge.recipes.utils.finetune_utils import default_peft_config as _default_peft_config +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm +from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) +from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed -class Qwen3VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Qwen3 VL MoE recipe helpers.""" +# ============================================================================= +# Qwen3-VL 8B SFT Configuration +# ============================================================================= +def qwen3_vl_8b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 8B (dense model). 
def qwen3_vl_8b_sft_config() -> ConfigContainer:
    """Return a full SFT config for Qwen3-VL 8B (dense model).

    Default configuration: 1 node, 8 GPUs
    - TP=2, PP=1
    - LR=5e-5 (full SFT)
    - Sequence length: 4096

    NOTE: train_iters/eval_iters defaults here are short (50/10), suitable for
    smoke runs; increase them for a real training job.
    """
    cfg = _sft_common_vlm()

    # Model configuration (weights are materialized later, not at config time)
    hf_path = "Qwen/Qwen3-VL-8B"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings
    cfg.model.tensor_model_parallel_size = 2
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: full SFT trains every sub-module
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # Token dispatcher settings (8B is dense, not MoE)
    cfg.model.moe_token_dispatcher_type = None
    cfg.model.moe_flex_dispatcher_backend = None
    cfg.model.moe_hybridep_num_sms = 16

    # Apply flex dispatcher backend (no-op for this non-MoE model)
    apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None)

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings (disabled by default)
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # MoE kernel selections (not applicable for dense 8B model)
    cfg.model.moe_router_fusion = False
    cfg.model.moe_permute_fusion = False
    cfg.model.moe_grouped_gemm = False

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # MoE overlap / balancing / FP8 padding (not applicable for dense model)
    cfg.model.moe_shared_expert_overlap = False
    cfg.model.moe_router_force_load_balancing = False
    cfg.model.moe_router_padding_for_fp8 = False

    # Training config (short smoke-run defaults)
    cfg.train.train_iters = 50
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 10

    # Optimizer - lower LR for full SFT (5e-5 -> 5e-6 cosine decay over 50 iters)
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=10,
        lr_decay_iters=50,
        max_lr=5e-5,
        min_lr=5e-6,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (full precision by default)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration - processor path must match the model
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings - no comm overlap for VLMs
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # Comm overlap settings (MoE) - disabled for dense model
    cfg.comm_overlap = None
    # cfg.comm_overlap.delay_wgrad_compute = False
    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
- """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-8B-Instruct", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_30b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Qwen3-VL-30B-A3B-Instruct. - - See `_qwen3_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen3-VL 30B-A3B SFT Configuration +# ============================================================================= +def qwen3_vl_30b_a3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 30B-A3B (MoE model). + + Default configuration: 4 nodes, 32 GPUs + - TP=1, PP=1, EP=8 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-30B-A3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_235b_a22b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Qwen3-VL-235B-A22B-Instruct. - - See `_qwen3_vl_common` for the full list of parameters. 
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3-VL-30B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 8 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + 
cfg.train.train_iters = 50 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 10 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=10, + lr_decay_iters=50, + max_lr=0.00005, + min_lr=0.000005, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# 
============================================================================= +# Qwen3-VL 235B-A22B SFT Configuration +# ============================================================================= +def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3-VL 235B-A22B (MoE model). + + Default configuration: 64 nodes, 512 GPUs + - TP=4, PP=1, EP=32 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-235B-A22B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 8, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "account_for_embedding_in_pipeline_split": True, - "account_for_loss_in_pipeline_split": True, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_8b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL 8B Instruct. 
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3-VL-235B-A22B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 32 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + 
cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# 
============================================================================= +# Qwen3-VL 8B PEFT Configuration +# ============================================================================= +def qwen3_vl_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3-VL 8B (dense model). Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, LR=1e-4 - - Full SFT: TP=4, PP=1, LR=1e-5 + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_qwen3_vl_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-8B-Instruct", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 1e-5 if is_full_sft else 1e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 1e-6, - "lr": 1e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_30b_a3b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL-30B-A3B-Instruct. - - This is a Mixture-of-Experts model with 128 experts and top-8 routing. - Recommended to use with expert parallelism (EP) for efficient training. 
+ cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3-VL-8B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (not MoE for 8B) + cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (will be no-op for non-MoE model) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections (not applicable for dense 8B model) + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = False + cfg.model.moe_grouped_gemm = False + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap (not applicable for dense model) + 
cfg.model.moe_shared_expert_overlap = False + + # MoE force balance (not applicable for dense model) + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding (not applicable for dense model) + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + # cfg.mixed_precision.fp8_recipe = None + # cfg.mixed_precision.fp8 = False + # cfg.mixed_precision.fp8_param_gather = False + # 
cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + # Uncomment below to use a pretrained checkpoint + # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" + + return cfg + + +# ============================================================================= +# Qwen3-VL 30B-A3B PEFT Configuration +# ============================================================================= +def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3-VL 30B-A3B (MoE model). Default configuration: 1 node, 8 GPUs - - LoRA/DoRA: TP=1, PP=1, EP=8, LR=2e-4 - - Full SFT: TP=1, PP=1, EP=8, LR=2e-5 + - TP=1, PP=1, EP=4 + - LR=1e-4 (PEFT) + - Sequence length: 4096 - See `_qwen3_vl_common` for the full list of parameters. - """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-30B-A3B-Instruct", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "peft": peft_value, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 2e-6, - "lr": 2e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def qwen3_vl_235b_a22b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3-VL-235B-A22B-Instruct. 
- - This is a Mixture-of-Experts model with 128 experts and top-8 routing. - Recommended to use with expert parallelism (EP) for efficient training. - - Default configuration: 4 nodes, 32 GPUs total - - LoRA/DoRA: TP=1, PP=1, EP=8, LR=2e-4 - - Full SFT: TP=4, PP=1, EP=8, LR=2e-5 - - See `_qwen3_vl_common` for the full list of parameters. + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - # Check if user is doing full SFT or PEFT - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-235B-A22B-Instruct", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "expert_tensor_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": True, - "freeze_vision_model": True, - "freeze_vision_projection": False, - "min_lr": 2e-6, - "lr": 2e-5, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) - - -def _qwen3_vl_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int 
= 4, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: Optional[int] = 4, - expert_tensor_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - enable_recompute: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - moe_flex_dispatcher_backend: Optional[str] = None, - # Freeze options - pretrained_checkpoint: Optional[str] = None, - freeze_language_model: bool = True, - freeze_vision_model: bool = True, - freeze_vision_projection: bool = False, + hf_path = "Qwen/Qwen3-VL-30B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower EP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 4 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # Token dispatcher settings (MoE) + cfg.model.moe_token_dispatcher_type = None + 
cfg.model.moe_flex_dispatcher_backend = None + cfg.model.moe_hybridep_num_sms = 16 + + # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch) + apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None) + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + 
# =============================================================================
# Qwen3-VL 235B-A22B PEFT Configuration
# =============================================================================
def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT config for Qwen3-VL 235B-A22B (MoE model).

    Default configuration: 8 nodes, 64 GPUs
    - TP=1, PP=1, EP=16
    - LR=1e-4 (PEFT)
    - Sequence length: 4096

    Args:
        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.

    Returns:
        ConfigContainer: Fully populated PEFT fine-tuning configuration.

    Raises:
        ValueError: If ``peft_scheme`` is a string other than "lora"/"dora".
    """
    cfg = _peft_common_vlm()

    # PEFT scheme: resolve known string schemes; accept PEFT instances as-is.
    # Reject unknown strings explicitly instead of silently storing a raw
    # string in cfg.peft, which would only fail much later at training time.
    if isinstance(peft_scheme, str):
        if peft_scheme.lower() in ("lora", "dora"):
            cfg.peft = default_peft_config(peft_scheme)
        else:
            raise ValueError(
                f"Unsupported peft_scheme '{peft_scheme}'. Expected 'lora', 'dora', or a PEFT instance."
            )
    else:
        cfg.peft = peft_scheme

    # Model configuration
    hf_path = "Qwen/Qwen3-VL-235B-A22B"
    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
    cfg.model.seq_length = 4096

    # Parallel settings - lower EP for PEFT
    cfg.model.tensor_model_parallel_size = 1
    cfg.model.pipeline_model_parallel_size = 1
    cfg.model.pipeline_dtype = None
    cfg.model.virtual_pipeline_model_parallel_size = None
    cfg.model.expert_model_parallel_size = 16
    cfg.model.context_parallel_size = 1
    cfg.model.sequence_parallel = False

    # VLM-specific settings: PEFT adapters handle freezing, so all submodules
    # stay trainable here.
    cfg.model.freeze_language_model = False
    cfg.model.freeze_vision_model = False
    cfg.model.freeze_vision_projection = False

    # Token dispatcher settings (MoE)
    cfg.model.moe_token_dispatcher_type = None
    cfg.model.moe_flex_dispatcher_backend = None
    cfg.model.moe_hybridep_num_sms = 16

    # Apply flex dispatcher backend (dynamically sets dispatcher based on GPU arch)
    apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend=None)

    # TE / Transformer implementation
    cfg.model.transformer_impl = "transformer_engine"

    # CUDA Graph settings
    cfg.model.cuda_graph_impl = "none"
    cfg.model.cuda_graph_scope = "full"
    cfg.model.cuda_graph_warmup_steps = 3

    # Kernel selections
    cfg.model.attention_backend = "auto"
    cfg.model.cross_entropy_loss_fusion = True
    cfg.model.cross_entropy_fusion_impl = "native"

    # MoE kernel selections
    cfg.model.moe_router_fusion = False
    cfg.model.moe_permute_fusion = True
    cfg.model.moe_grouped_gemm = True

    # Memory saving (disabled by default)
    cfg.model.recompute_granularity = None
    cfg.model.recompute_modules = None
    cfg.model.fine_grained_activation_offloading = False
    cfg.model.offload_modules = None

    # MoE overlap
    cfg.model.moe_shared_expert_overlap = False

    # MoE force balance
    cfg.model.moe_router_force_load_balancing = False

    # MoE FP8 padding
    cfg.model.moe_router_padding_for_fp8 = False

    # Training config
    cfg.train.train_iters = 300000
    cfg.train.global_batch_size = 32
    cfg.train.micro_batch_size = 2
    cfg.train.manual_gc = True
    cfg.train.manual_gc_interval = 100
    cfg.train.manual_gc_eval = 100

    # Validation config
    cfg.validation.eval_interval = 500
    cfg.validation.eval_iters = 32

    # Optimizer - PEFT learning rate (1e-4); decay spans the full run, so the
    # decay horizon is tied to train_iters rather than a duplicated constant.
    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=cfg.train.train_iters,
        max_lr=1e-4,
        min_lr=3e-5,
    )
    cfg.optimizer = opt_cfg
    cfg.scheduler = scheduler_cfg

    # Optimizer precision settings (disabled by default for full precision)
    cfg.optimizer.use_precision_aware_optimizer = False
    cfg.optimizer.main_grads_dtype = torch.float32
    cfg.optimizer.main_params_dtype = torch.float32
    cfg.optimizer.exp_avg_dtype = torch.float32
    cfg.optimizer.exp_avg_sq_dtype = torch.float32

    # Dataset configuration
    cfg.dataset.seq_length = 4096
    cfg.dataset.hf_processor_path = hf_path

    # DDP settings
    cfg.ddp.overlap_grad_reduce = False
    cfg.ddp.overlap_param_gather = False
    cfg.ddp.check_for_nan_in_grad = True
    cfg.ddp.use_distributed_optimizer = True
    cfg.ddp.grad_reduce_in_fp32 = True
    cfg.ddp.average_in_collective = True
    cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params"

    # Comm overlap settings (MoE)
    cfg.comm_overlap = None
    # cfg.comm_overlap.delay_wgrad_compute = False
    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False

    # FP8 and MXFP8 settings (disabled by default)
    cfg.mixed_precision = "bf16_mixed"
    # cfg.mixed_precision.fp8_recipe = None
    # cfg.mixed_precision.fp8 = False
    # cfg.mixed_precision.fp8_param_gather = False
    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False

    # Checkpoint config
    # cfg.checkpoint.save = "path/to/save"
    # cfg.checkpoint.load = "path/to/load"
    # Uncomment below to use a pretrained checkpoint
    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"

    return cfg
a/tests/functional_tests/recipes/test_glm_45v_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_glm_45v_recipes_finetune.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Functional smoke tests for Ministral 3 recipe configurations.""" - -from functools import partial +"""Functional smoke tests for GLM-4.5V recipe configurations.""" import pytest from megatron.bridge.recipes.glm_vl.glm_45v import ( - glm_45v_finetune_config, + glm_45v_sft_config, ) from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -27,8 +25,8 @@ GLM_45V_FINETUNE_RECIPES = [ # Small model, only use 2 layers for quick functional test ( - partial(glm_45v_finetune_config, peft=None), - "glm_45v", + glm_45v_sft_config, + "glm_45v_sft", { "tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, @@ -46,8 +44,8 @@ GLM_45V_FINETUNE_PACKED_RECIPES = [ # Small model with packed sequences, only use 2 layers ( - partial(glm_45v_finetune_config, peft=None), - "glm_45v_packed", + glm_45v_sft_config, + "glm_45v_sft_packed", { "tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, diff --git a/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py b/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py index fa9dedb4c0..8274d50dc2 100644 --- a/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_ministral3_recipes_finetune.py @@ -14,12 +14,10 @@ """Functional smoke tests for Ministral 3 recipe configurations.""" -from functools import partial - import pytest from megatron.bridge.recipes.ministral3.ministral3 import ( - ministral3_3b_finetune_config, + ministral3_3b_sft_config, ) from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -27,8 +25,8 @@ MINISTRAL3_FINETUNE_RECIPES = [ # Small model, only use 2 layers for quick functional test ( - 
partial(ministral3_3b_finetune_config, peft=None), - "ministral3_3b", + ministral3_3b_sft_config, + "ministral3_3b_sft", {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, "num_layers": 2}, ), ] @@ -36,8 +34,8 @@ MINISTRAL3_FINETUNE_PACKED_RECIPES = [ # Small model with packed sequences, only use 2 layers ( - partial(ministral3_3b_finetune_config, peft=None), - "ministral3_3b_packed", + ministral3_3b_sft_config, + "ministral3_3b_sft_packed", {"tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, "num_layers": 2}, {"pack_sequences_in_batch": True}, ), diff --git a/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py index 55b76dbf96..1ce299315d 100644 --- a/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_nemotron_vl_recipes_finetune.py @@ -14,30 +14,20 @@ """Functional smoke tests for Nemotron Nano V2 VL recipe configurations.""" -import functools - import pytest -from megatron.bridge.recipes.nemotron_vl import nemotron_nano_v2_vl as nemotron_recipe +from megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl import ( + nemotron_nano_v2_vl_12b_sft_config, +) from megatron.bridge.training import llava_step from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test -def _finetune_wrapper(**kwargs): - """Wrapper to adapt Nemotron VL finetune_config to the test runner signature. - - The runner will pass (dir, name, dataset_type=mock) among others; we forward - everything to finetune_config and inject a dummy pretrained_checkpoint. 
- """ - kwargs.setdefault("pretrained_checkpoint", "/tmp/fake_nemotron_vl_ckpt") - return nemotron_recipe.nemotron_nano_v2_vl_12b_finetune_config(**kwargs) - - NEMOTRON_VL_FINETUNE_RECIPES = [ # Small model, only use 2 layers ( - functools.partial(_finetune_wrapper, hf_model_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), - "nemotron_vl_nano_v2", + nemotron_nano_v2_vl_12b_sft_config, + "nemotron_vl_nano_v2_sft", { "num_layers": 3, "hybrid_override_pattern": "M*-", diff --git a/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py index f652bebe12..08f5f02c18 100644 --- a/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_qwen3_vl_recipes_finetune.py @@ -27,7 +27,7 @@ import pytest -from megatron.bridge.recipes.qwen_vl.qwen3_vl import qwen3_vl_8b_finetune_config +from megatron.bridge.recipes.qwen_vl.qwen3_vl import qwen3_vl_8b_sft_config from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -36,14 +36,14 @@ # Qwen3-VL 8B finetune - uses TP=2 for 2-GPU CI # Note: deepstack_visual_indexes must have len <= num_layers ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 4, "deepstack_visual_indexes": [0, 1, 2]}, ), ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", { "tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1, @@ -60,8 +60,8 @@ }, ), ( - qwen3_vl_8b_finetune_config, - "qwen3_vl_8b_finetune", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft", { "tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1, @@ -77,8 +77,8 @@ # (config_func, recipe_name, parallelism_overrides, model_overrides, dataset_overrides) # Qwen3-VL 8B finetune with packed sequences ( - qwen3_vl_8b_finetune_config, - 
"qwen3_vl_8b_finetune_packed", + qwen3_vl_8b_sft_config, + "qwen3_vl_8b_sft_packed", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 4, "deepstack_visual_indexes": [0, 1, 2]}, {"pack_sequences_in_batch": True}, diff --git a/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py index 3dd20a924c..3462ccd98f 100644 --- a/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py @@ -16,15 +16,15 @@ import pytest -from megatron.bridge.recipes.qwen_vl.qwen25_vl import qwen25_vl_3b_finetune_config +from megatron.bridge.recipes.qwen_vl.qwen25_vl import qwen25_vl_3b_sft_config from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test QWEN_VL_PRETRAIN_RECIPES = [ - # (config_func, name, parallelism_overrides) + # (config_func, name, parallelism_overrides, model_overrides) # Two-GPU TP for local/CI multi-GPU runs ( - qwen25_vl_3b_finetune_config, + qwen25_vl_3b_sft_config, "qwen25_vl_3b", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 2}, @@ -35,7 +35,7 @@ # (config_func, name, parallelism_overrides, model_overrides, dataset_overrides) # Two-GPU TP with packed sequences ( - qwen25_vl_3b_finetune_config, + qwen25_vl_3b_sft_config, "qwen25_vl_3b_packed", {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}, {"num_layers": 2}, diff --git a/tests/functional_tests/recipes/utils.py b/tests/functional_tests/recipes/utils.py index 97dbc6fae9..d62e8b203b 100644 --- a/tests/functional_tests/recipes/utils.py +++ b/tests/functional_tests/recipes/utils.py @@ -213,7 +213,8 @@ def run_pretrain_vl_recipe_test( megatron.bridge.training.vlm_step.forward_step. 
Args: - config_func: The recipe's pretrain_config function + config_func: The recipe's config function (parameterless API for SFT, + or takes peft_scheme parameter for PEFT) recipe_name: Name of the recipe for logging/debugging tmp_path: Temporary directory for test outputs tensor_model_parallel_size: Override tensor parallelism (None = use recipe default) @@ -221,6 +222,8 @@ def run_pretrain_vl_recipe_test( model_overrides: Optional mapping of model attribute overrides to apply dataset_overrides: Optional mapping of dataset attribute overrides to apply """ + from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider + if forward_step_func is None: # Import locally to avoid loading VLM stack for non-VL tests from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step @@ -228,13 +231,20 @@ def run_pretrain_vl_recipe_test( vlm_forward_step = forward_step_func initialize_distributed() - shared_base_dir = broadcast_path(tmp_path) + shared_base_dir = Path(broadcast_path(tmp_path)) try: - # Note: qwen_vl recipe config functions do not support 'mock' kwarg - config: ConfigContainer = config_func( - dir=str(shared_base_dir), name=f"{recipe_name}_functional_test", dataset_type="mock" - ) + # VLM recipe configs use parameterless API - call without arguments + config: ConfigContainer = config_func() + + # Set up output directories after instantiation + run_output_dir = shared_base_dir / f"{recipe_name}_functional_test" + checkpoint_dir = run_output_dir / "checkpoints" + tensorboard_dir = run_output_dir / "tb_logs" + config.checkpoint.save = str(checkpoint_dir) + config.checkpoint.load = str(checkpoint_dir) + config.logger.tensorboard_dir = str(tensorboard_dir) + # Keep runs short and consistent across tests config.train.train_iters = 10 config.validation.eval_interval = 5 @@ -245,31 +255,26 @@ def run_pretrain_vl_recipe_test( config.scheduler.lr_warmup_iters = 1 test_seq_length = 1024 config.model.seq_length = test_seq_length - 
config.dataset.seq_length = test_seq_length - # Disable pin-memory and worker persistence in tests to avoid - # pin-memory device mismatches under torchrun+pytest environments. - config.dataset.pin_memory = False - config.dataset.num_workers = 0 - config.dataset.persistent_workers = False - - train_samples_needed = config.train.train_iters * config.train.global_batch_size - eval_samples_needed = config.validation.eval_iters * config.train.global_batch_size - test_samples_needed = 8 - - total_samples = train_samples_needed + eval_samples_needed + test_samples_needed - - # Set dataset split ratios for minimal dataset - train_split = train_samples_needed / total_samples - valid_split = eval_samples_needed / total_samples - test_split = test_samples_needed / total_samples - - config.dataset.split = [train_split, valid_split, test_split] + # Get the HF processor path from the original dataset config before replacing + hf_processor_path = getattr(config.dataset, "hf_processor_path", None) + pack_sequences_in_batch = getattr(config.dataset, "pack_sequences_in_batch", False) + + # Replace the real dataset with a mock dataset provider for tests + # MockVLMConversationProvider generates synthetic data and doesn't need a split attribute + # since the DatasetBuildContext calculates sample counts from training configuration + config.dataset = MockVLMConversationProvider( + seq_length=test_seq_length, + hf_processor_path=hf_processor_path, + pack_sequences_in_batch=pack_sequences_in_batch, + ) if tensor_model_parallel_size is not None: - config.model.tensor_model_parallel_size = tensor_model_parallel_size + if hasattr(config.model, "tensor_model_parallel_size"): + config.model.tensor_model_parallel_size = tensor_model_parallel_size if pipeline_model_parallel_size is not None: - config.model.pipeline_model_parallel_size = pipeline_model_parallel_size + if hasattr(config.model, "pipeline_model_parallel_size"): + config.model.pipeline_model_parallel_size = 
pipeline_model_parallel_size # Apply any model-specific overrides provided by the caller if model_overrides: @@ -281,7 +286,7 @@ def run_pretrain_vl_recipe_test( for attribute_name, attribute_value in dataset_overrides.items(): setattr(config.dataset, attribute_name, attribute_value) - if config.dataset.pack_sequences_in_batch: + if hasattr(config.dataset, "pack_sequences_in_batch") and config.dataset.pack_sequences_in_batch: config.train.micro_batch_size = 2 pretrain(config, vlm_forward_step) diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py new file mode 100644 index 0000000000..1bd925b4d0 --- /dev/null +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py @@ -0,0 +1,457 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Test purpose: +# - Parametrize over all exported Qwen2.5-VL recipe functions in `megatron.bridge.recipes.qwen_vl.qwen25_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. 
#

import importlib
from typing import Callable

import pytest
import torch


_qwen25_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen25_vl")

# SFT recipe entry points (take no arguments).
_QWEN25_VL_SFT_FUNCS = [
    _qwen25_vl_module.qwen25_vl_3b_sft_config,
    _qwen25_vl_module.qwen25_vl_7b_sft_config,
    _qwen25_vl_module.qwen25_vl_32b_sft_config,
    _qwen25_vl_module.qwen25_vl_72b_sft_config,
]

# PEFT recipe entry points (accept a ``peft_scheme`` argument).
_QWEN25_VL_PEFT_FUNCS = [
    _qwen25_vl_module.qwen25_vl_3b_peft_config,
    _qwen25_vl_module.qwen25_vl_7b_peft_config,
    _qwen25_vl_module.qwen25_vl_32b_peft_config,
    _qwen25_vl_module.qwen25_vl_72b_peft_config,
]


class _FakeModelCfg:
    """Lightweight stand-in for the Megatron model provider config."""

    def __init__(self):
        # Defaults mirror a single-GPU layout; recipes overwrite as needed.
        self.tensor_model_parallel_size = 1
        self.pipeline_model_parallel_size = 1
        self.pipeline_dtype = None
        self.virtual_pipeline_model_parallel_size = None
        self.context_parallel_size = 1
        self.sequence_parallel = False
        self.seq_length = 64
        self.freeze_language_model = False
        self.freeze_vision_model = False
        self.freeze_vision_projection = False

    def finalize(self):
        # No-op; the real provider performs validation here.
        return None


class _FakeAutoBridge:
    """Stand-in for AutoBridge that never touches the network or disk."""

    @staticmethod
    def from_hf_pretrained(hf_path: str):
        """Pretend to load a HF checkpoint; return a fake bridge."""
        return _FakeAutoBridge()

    def to_megatron_provider(self, load_weights: bool = False):
        """Hand back a fake model config instead of a real provider."""
        return _FakeModelCfg()


def _assert_basic_config(config):
    """Verify every mandatory section of a ConfigContainer is populated."""
    from megatron.bridge.training.config import ConfigContainer

    assert isinstance(config, ConfigContainer)
    assert config.model is not None
    assert config.train is not None
    assert config.optimizer is not None
    assert config.scheduler is not None
    assert config.dataset is not None
    assert config.logger is not None
    assert config.tokenizer is not None
    assert config.checkpoint is not None
    assert config.rng is not None

    assert config.train.global_batch_size >= 1
    assert config.train.micro_batch_size >= 1
    assert config.dataset.seq_length >= 1


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_SFT_FUNCS)
def test_each_qwen25_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch):
    """Every Qwen2.5-VL SFT recipe should produce a valid config without PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func()

    _assert_basic_config(config)

    if hasattr(config, "tokenizer") and hasattr(config.tokenizer, "tokenizer_type"):
        assert config.tokenizer.tokenizer_type == "NullTokenizer"

    assert getattr(config.model, "tensor_model_parallel_size", 1) >= 1
    assert getattr(config.model, "pipeline_model_parallel_size", 1) >= 1

    assert hasattr(config.model, "freeze_language_model")
    assert hasattr(config.model, "freeze_vision_model")
    assert hasattr(config.model, "freeze_vision_projection")

    assert config.peft is None


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_PEFT_FUNCS)
def test_each_qwen25_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch):
    """Every Qwen2.5-VL PEFT recipe should produce a valid config with PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func()

    _assert_basic_config(config)

    if hasattr(config, "tokenizer") and hasattr(config.tokenizer, "tokenizer_type"):
        assert config.tokenizer.tokenizer_type == "NullTokenizer"

    assert getattr(config.model, "tensor_model_parallel_size", 1) >= 1
    assert getattr(config.model, "pipeline_model_parallel_size", 1) >= 1

    assert hasattr(config.model, "freeze_language_model")
    assert hasattr(config.model, "freeze_vision_model")
    assert hasattr(config.model, "freeze_vision_projection")

    assert config.peft is not None
    assert hasattr(config.peft, "dim")
    assert hasattr(config.peft, "alpha")


@pytest.mark.parametrize("recipe_func", _QWEN25_VL_PEFT_FUNCS)
@pytest.mark.parametrize("peft_scheme", ["lora", "dora"])
def test_qwen25_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch):
    """Both supported PEFT schemes should be accepted by every PEFT recipe."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = recipe_func(peft_scheme=peft_scheme)

    _assert_basic_config(config)

    assert config.peft is not None
    assert hasattr(config.peft, "dim")
    assert hasattr(config.peft, "alpha")


def test_qwen25_vl_3b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B SFT defaults: TP=1, PP=1, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is None


def test_qwen25_vl_3b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B LoRA defaults: TP=1, PP=1, dim=32, alpha=32."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config(peft_scheme="lora")

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1

    assert config.peft is not None
    assert config.peft.dim == 32
    assert config.peft.alpha == 32


def test_qwen25_vl_3b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch):
    """3B DoRA defaults: TP=1, PP=1, dim=32, alpha=64."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config(peft_scheme="dora")

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1

    assert config.peft is not None
    assert config.peft.dim == 32
    assert config.peft.alpha == 64


def test_qwen25_vl_7b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """7B SFT defaults: TP=2, PP=1, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_7b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 2
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is None


def test_qwen25_vl_7b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """7B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_7b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_32b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """32B SFT defaults: TP=8, PP=2, bf16 pipeline dtype, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_32b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 8
    assert config.model.pipeline_model_parallel_size == 2
    assert config.model.pipeline_dtype == torch.bfloat16
    assert config.peft is None


def test_qwen25_vl_32b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """32B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_32b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_72b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
    """72B SFT defaults: TP=8, PP=4, bf16 pipeline dtype, no PEFT."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_72b_sft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 8
    assert config.model.pipeline_model_parallel_size == 4
    assert config.model.pipeline_dtype == torch.bfloat16
    assert config.peft is None


def test_qwen25_vl_72b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
    """72B PEFT defaults: TP=1, PP=1, PEFT attached."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_72b_peft_config()

    _assert_basic_config(config)

    assert config.model.tensor_model_parallel_size == 1
    assert config.model.pipeline_model_parallel_size == 1
    assert config.peft is not None


def test_qwen25_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch):
    """SFT configs should default to the HFDatasetConversationProvider."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider

    assert isinstance(config.dataset, HFDatasetConversationProvider)


def test_qwen25_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch):
    """PEFT configs should default to the HFDatasetConversationProvider."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config()

    from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider

    assert isinstance(config.dataset, HFDatasetConversationProvider)


def test_qwen25_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch):
    """SFT configs should leave all freeze options off by default."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_sft_config()

    assert config.model.freeze_language_model is False
    assert config.model.freeze_vision_model is False
    assert config.model.freeze_vision_projection is False


def test_qwen25_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch):
    """PEFT configs should leave all freeze options off by default."""
    monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge)

    config = _qwen25_vl_module.qwen25_vl_3b_peft_config()

    assert config.model.freeze_language_model is False
    assert config.model.freeze_vision_model is False
    assert config.model.freeze_vision_projection is False
freeze options set to False by default.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_peft_config() + + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen25_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.mixed_precision == "bf16_mixed" + + +def test_qwen25_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + + +def test_qwen25_vl_optimizer_precision_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that optimizer precision settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.optimizer.use_precision_aware_optimizer is False + assert cfg.optimizer.main_grads_dtype == torch.float32 + assert cfg.optimizer.main_params_dtype == torch.float32 + assert cfg.optimizer.exp_avg_dtype == torch.float32 + assert cfg.optimizer.exp_avg_sq_dtype == torch.float32 + + +def test_qwen25_vl_training_config(monkeypatch: pytest.MonkeyPatch): + """Test that training configuration is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + 
_assert_basic_config(cfg) + + assert cfg.train.train_iters == 300000 + assert cfg.train.global_batch_size == 32 + assert cfg.train.micro_batch_size == 2 + assert cfg.train.manual_gc is True + assert cfg.train.manual_gc_interval == 100 + + +def test_qwen25_vl_validation_config(monkeypatch: pytest.MonkeyPatch): + """Test that validation configuration is correctly set.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.validation.eval_interval == 500 + assert cfg.validation.eval_iters == 32 + + +def test_qwen25_vl_sft_learning_rate(monkeypatch: pytest.MonkeyPatch): + """Test that SFT has lower learning rate than PEFT.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + sft_cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + peft_cfg = _qwen25_vl_module.qwen25_vl_3b_peft_config() + + # SFT should have lower LR (5e-6) compared to PEFT (1e-4) + assert sft_cfg.optimizer.lr < peft_cfg.optimizer.lr + + +def test_qwen25_vl_kernel_settings(monkeypatch: pytest.MonkeyPatch): + """Test that kernel settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.attention_backend == "auto" + assert cfg.model.cross_entropy_loss_fusion is True + assert cfg.model.cross_entropy_fusion_impl == "native" + + +def test_qwen25_vl_cuda_graph_settings(monkeypatch: pytest.MonkeyPatch): + """Test that CUDA graph settings are correctly configured.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.cuda_graph_impl == "none" + assert cfg.model.cuda_graph_scope == "full" + assert cfg.model.cuda_graph_warmup_steps == 3 + + +def test_qwen25_vl_transformer_impl(monkeypatch: 
pytest.MonkeyPatch): + """Test that transformer implementation is set correctly.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.transformer_impl == "transformer_engine" + + +def test_qwen25_vl_memory_saving_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that memory saving settings are disabled by default.""" + monkeypatch.setattr(_qwen25_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen25_vl_module.qwen25_vl_3b_sft_config() + + _assert_basic_config(cfg) + + assert cfg.model.recompute_granularity is None + assert cfg.model.recompute_modules is None + assert cfg.model.fine_grained_activation_offloading is False + assert cfg.model.offload_modules is None diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py index 1ba00b910f..0bf6698ec6 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py @@ -12,88 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. +# +# Test purpose: +# - Parametrize over all exported Qwen3-VL recipe functions in `megatron.bridge.recipes.qwen_vl.qwen3_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. +# - Test MoE-specific settings for Qwen3-VL MoE models. 
+# + import importlib -from typing import Callable, List +from typing import Callable import pytest _qwen3_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen3_vl") +# SFT configs (parameterless) +_QWEN3_VL_SFT_FUNCS = [ + _qwen3_vl_module.qwen3_vl_8b_sft_config, + _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config, + _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config, +] -def _collect_recipe_functions(mod) -> List[Callable]: - # Prefer explicit exports - exported_names = getattr(mod, "__all__", None) - candidates: List[Callable] = [] - - if exported_names: - for name in exported_names: - fn = getattr(mod, name, None) - if callable(fn) and (name.endswith("_config") or "qwen3" in name.lower() or "qwen" in name.lower()): - candidates.append(fn) - else: - # Fallback: discover by convention - for name in dir(mod): - if name.startswith("_"): - continue - fn = getattr(mod, name, None) - if callable(fn) and name.endswith("_config"): - candidates.append(fn) - - # De-dupe while preserving order - seen = set() - unique = [] - for fn in candidates: - if fn.__name__ not in seen: - unique.append(fn) - seen.add(fn.__name__) - return unique - - -_QWEN3_VL_RECIPE_FUNCS: List[Callable] = _collect_recipe_functions(_qwen3_vl_module) - - -def _safe_overrides_for(name: str) -> dict: - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 5, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "mock": True, - "lr": 1e-4, - "use_null_tokenizer": True, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - return overrides +# PEFT configs (take peft_scheme parameter) +_QWEN3_VL_PEFT_FUNCS = [ + _qwen3_vl_module.qwen3_vl_8b_peft_config, + _qwen3_vl_module.qwen3_vl_30b_a3b_peft_config, + _qwen3_vl_module.qwen3_vl_235b_a22b_peft_config, +] class _FakeModelCfg: + """Fake model configuration for testing.""" + def __init__(self): - self.cross_entropy_fusion_impl = "te" + # Set 
default attributes that recipes might set + self.tensor_model_parallel_size = 1 + self.pipeline_model_parallel_size = 1 + self.pipeline_dtype = None + self.virtual_pipeline_model_parallel_size = None + self.context_parallel_size = 1 + self.expert_model_parallel_size = 1 + self.sequence_parallel = False + self.seq_length = 64 + self.freeze_language_model = False + self.freeze_vision_model = False + self.freeze_vision_projection = False + # MoE-specific + self.moe_token_dispatcher_type = None + self.moe_flex_dispatcher_backend = None + self.moe_hybridep_num_sms = None + self.moe_router_fusion = False + self.moe_permute_fusion = False + self.moe_grouped_gemm = False + self.moe_router_padding_for_fp8 = False + self.moe_shared_expert_overlap = False + self.moe_router_force_load_balancing = False def finalize(self): return None -class _FakeBridge: - def __init__(self): - pass +class _FakeAutoBridge: + """Fake AutoBridge for testing.""" + + @staticmethod + def from_hf_pretrained(hf_path: str): + """Mock from_hf_pretrained method.""" + return _FakeAutoBridge() def to_megatron_provider(self, load_weights: bool = False): + """Return a fake model config.""" return _FakeModelCfg() - @staticmethod - def from_hf_pretrained(hf_path: str, **kwargs): - return _FakeBridge() - def _assert_basic_config(cfg): + """Assert that a config has all required components.""" from megatron.bridge.training.config import ConfigContainer assert isinstance(cfg, ConfigContainer) @@ -109,30 +105,342 @@ def _assert_basic_config(cfg): assert cfg.train.global_batch_size >= 1 assert cfg.train.micro_batch_size >= 1 + assert cfg.dataset.seq_length >= 1 + + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_SFT_FUNCS) +def test_each_qwen3_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3-VL SFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_qwen3_vl_module, 
"AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() - # Different dataset configs may expose length as sequence_length or seq_length; - # for multimodal datasets there may be no such attribute. Only assert presence when available. - if hasattr(cfg.dataset, "sequence_length"): - assert cfg.dataset.sequence_length >= 1 - elif hasattr(cfg.dataset, "seq_length"): - assert cfg.dataset.seq_length >= 1 - else: - assert cfg.dataset is not None + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 -@pytest.mark.parametrize("recipe_func", _QWEN3_VL_RECIPE_FUNCS) -def test_each_qwen3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - # Monkeypatch AutoBridge used inside the recipe module to avoid heavyweight init - module_name = recipe_func.__module__ - mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - overrides = _safe_overrides_for(recipe_func.__name__) + # SFT configs should not have PEFT + assert cfg.peft is None - cfg = recipe_func(**overrides) + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_PEFT_FUNCS) +def test_each_qwen3_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3-VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() # Default peft_scheme="lora" 
_assert_basic_config(cfg) - # Minimal sanity checks on parallelism fields being set to sane values + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") + + +@pytest.mark.parametrize("recipe_func", _QWEN3_VL_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_qwen3_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Qwen3-VL models.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func(peft_scheme=peft_scheme) + + _assert_basic_config(cfg) + + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") + + +def test_qwen3_vl_8b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B SFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 8B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + +def test_qwen3_vl_8b_peft_lora_defaults(monkeypatch: 
pytest.MonkeyPatch): + """Test that 8B LoRA has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config(peft_scheme="lora") + + _assert_basic_config(cfg) + + # For LoRA, 8B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 32 + + +def test_qwen3_vl_8b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B DoRA has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config(peft_scheme="dora") + + _assert_basic_config(cfg) + + # For DoRA, 8B should use same parallelism as LoRA + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config (DoRA has alpha=64 by default, unlike LoRA's alpha=32) + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 64 + + +def test_qwen3_vl_30b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 30B-A3B SFT has correct default parallelism and MoE settings.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 30B-A3B should use TP=1, PP=1, EP=8 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 8 + + +def test_qwen3_vl_30b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 30B-A3B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + 
monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_peft_config() + + _assert_basic_config(cfg) + + # For LoRA, 30B-A3B should use TP=1, PP=1, EP=4 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 4 + + # Check PEFT config + assert cfg.peft is not None + + +def test_qwen3_vl_235b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 235B-A22B SFT has correct default parallelism and MoE settings.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 235B-A22B should use TP=4, PP=1, EP=32 + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 32 + + +def test_qwen3_vl_235b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 235B-A22B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_peft_config() + + _assert_basic_config(cfg) + + # For LoRA, 235B-A22B should use TP=1, PP=1, EP=16 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 16 + + # Check PEFT config + assert cfg.peft is not None + + +def test_qwen3_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + from 
megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_qwen3_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config() + + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_qwen3_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen3_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_config() + + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen3_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # Default should be bf16_mixed + assert 
cfg.mixed_precision == "bf16_mixed" + + +def test_qwen3_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + + +def test_qwen3_vl_moe_settings_30b(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured for 30B-A3B model.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_30b_a3b_sft_config() + + _assert_basic_config(cfg) + + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + +def test_qwen3_vl_moe_settings_235b(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured for 235B-A22B model.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config() + + _assert_basic_config(cfg) + + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert 
hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + +def test_qwen3_vl_8b_is_dense_model(monkeypatch: pytest.MonkeyPatch): + """Test that 8B is a dense model without MoE-specific parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen3_vl_module.qwen3_vl_8b_sft_config() + + _assert_basic_config(cfg) + + # 8B should be dense model with EP=1 + assert cfg.model.expert_model_parallel_size == 1 + + # Verify dense model kernel settings + assert cfg.model.moe_router_fusion is False + assert cfg.model.moe_permute_fusion is False + assert cfg.model.moe_grouped_gemm is False diff --git a/tests/unit_tests/recipes/test_gemma3_vl_recipes.py b/tests/unit_tests/recipes/test_gemma3_vl_recipes.py index bbe29bb3ce..d9707a634a 100644 --- a/tests/unit_tests/recipes/test_gemma3_vl_recipes.py +++ b/tests/unit_tests/recipes/test_gemma3_vl_recipes.py @@ -16,7 +16,7 @@ # Test purpose: # - Parametrize over all exported Gemma3-VL recipe functions in `megatron.bridge.recipes.gemma3_vl.gemma3_vl`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. 
# @@ -28,42 +28,23 @@ _gemma3_vl_module = importlib.import_module("megatron.bridge.recipes.gemma3_vl.gemma3_vl") -_GEMMA3_VL_RECIPE_FUNCS = [ - _gemma3_vl_module.gemma3_vl_4b_finetune_config, - _gemma3_vl_module.gemma3_vl_12b_finetune_config, - _gemma3_vl_module.gemma3_vl_27b_finetune_config, + +# SFT configs (parameterless) +_GEMMA3_VL_SFT_FUNCS = [ + _gemma3_vl_module.gemma3_vl_4b_sft_config, + _gemma3_vl_module.gemma3_vl_12b_sft_config, + _gemma3_vl_module.gemma3_vl_27b_sft_config, ] +# PEFT configs (take peft_scheme parameter) +_GEMMA3_VL_PEFT_FUNCS = [ + _gemma3_vl_module.gemma3_vl_4b_peft_config, + _gemma3_vl_module.gemma3_vl_12b_peft_config, + _gemma3_vl_module.gemma3_vl_27b_peft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - # Large models/variants may set additional flags in recipes; keep harmless defaults - lname = name.lower() - if "12b" in lname or "27b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": True, - } - ) - - return overrides +# All recipe functions +_GEMMA3_VL_ALL_FUNCS = _GEMMA3_VL_SFT_FUNCS + _GEMMA3_VL_PEFT_FUNCS class _FakeModelCfg: @@ -119,15 +100,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_RECIPE_FUNCS) -def test_each_gemma3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each Gemma3-VL recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_SFT_FUNCS) +def 
test_each_gemma3_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Gemma3-VL SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -144,115 +123,78 @@ def test_each_gemma3_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_gemma3_vl_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["dataset_type"] = dataset_type - - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) - - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - 
assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_PEFT_FUNCS) +def test_each_gemma3_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Gemma3-VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + cfg = recipe_func() # Default peft_scheme="lora" -def test_gemma3_vl_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + _assert_basic_config(cfg) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def 
test_gemma3_vl_27b_pipeline_dtype(monkeypatch: pytest.MonkeyPatch): - """Test that 27B model sets pipeline_dtype correctly for full SFT.""" +@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_gemma3_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Gemma3-VL models.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = None # Full SFT - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) - - # The 27B model should set pipeline_dtype to bfloat16 for full SFT - assert cfg.model.pipeline_dtype == torch.bfloat16 + cfg = recipe_func(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_GEMMA3_VL_FINETUNE_FUNCS = [ - _gemma3_vl_module.gemma3_vl_4b_finetune_config, - _gemma3_vl_module.gemma3_vl_12b_finetune_config, - _gemma3_vl_module.gemma3_vl_27b_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _GEMMA3_VL_FINETUNE_FUNCS) -@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_gemma3_vl_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for Gemma3-VL models.""" +def test_gemma3_vl_4b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B SFT has correct default parallelism and learning rate.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = 
_gemma3_vl_module.gemma3_vl_4b_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, 4B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None -def test_gemma3_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B LoRA has correct default parallelism and learning rate.""" +def test_gemma3_vl_4b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Don't override finetune_lr to test default - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config(peft_scheme="lora") _assert_basic_config(cfg) @@ -265,22 +207,13 @@ def test_gemma3_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_gemma3_vl_4b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B DoRA has correct default parallelism and learning rate.""" +def test_gemma3_vl_4b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 4B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = 
_safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -294,42 +227,27 @@ def test_gemma3_vl_4b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_gemma3_vl_4b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 4B full SFT has correct default parallelism and learning rate.""" +def test_gemma3_vl_12b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_12b_sft_config() _assert_basic_config(cfg) - # For full SFT, 4B should use TP=1, PP=1 - assert cfg.model.tensor_model_parallel_size == 1 + # For full SFT, 12B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 - -def test_gemma3_vl_12b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 12B LoRA has correct default parallelism.""" +def test_gemma3_vl_12b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, 
"AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_12b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_12b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_12b_peft_config() _assert_basic_config(cfg) @@ -341,39 +259,30 @@ def test_gemma3_vl_12b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_gemma3_vl_12b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 12B full SFT has correct default parallelism.""" +def test_gemma3_vl_27b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 27B SFT has correct default parallelism and pipeline_dtype.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_12b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_12b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_sft_config() _assert_basic_config(cfg) - # For full SFT, 12B should use TP=4, PP=1 - assert cfg.model.tensor_model_parallel_size == 4 - assert cfg.model.pipeline_model_parallel_size == 1 + # For full SFT, 27B should use TP=8, PP=2 + assert cfg.model.tensor_model_parallel_size == 8 + assert cfg.model.pipeline_model_parallel_size == 2 assert cfg.peft is None + # For full SFT, pipeline_dtype should be set to bfloat16 + assert cfg.model.pipeline_dtype == torch.bfloat16 + -def test_gemma3_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 27B LoRA has correct default parallelism.""" +def test_gemma3_vl_27b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 27B 
PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_peft_config() _assert_basic_config(cfg) @@ -388,43 +297,12 @@ def test_gemma3_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.model.pipeline_dtype is None -def test_gemma3_vl_27b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 27B full SFT has correct default parallelism.""" - - # Monkeypatch AutoBridge - monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # For full SFT, 27B should use TP=8, PP=2 - assert cfg.model.tensor_model_parallel_size == 8 - assert cfg.model.pipeline_model_parallel_size == 2 - assert cfg.peft is None - - # For full SFT, pipeline_dtype should be set to bfloat16 - assert cfg.model.pipeline_dtype == torch.bfloat16 - - -def test_gemma3_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): +def test_gemma3_vl_27b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): """Test that 27B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_27b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test 
recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _gemma3_vl_module.gemma3_vl_27b_finetune_config(**overrides) + cfg = _gemma3_vl_module.gemma3_vl_27b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -439,42 +317,80 @@ def test_gemma3_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.model.pipeline_dtype is None -def test_gemma3_vl_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" +def test_gemma3_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - _assert_basic_config(cfg) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 +def test_gemma3_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config() -def test_gemma3_vl_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def 
test_gemma3_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("gemma3_vl_4b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() - cfg = _gemma3_vl_module.gemma3_vl_4b_finetune_config(**overrides) + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False - _assert_basic_config(cfg) - # Check PEFT config - assert cfg.peft is not None +def test_gemma3_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_peft_config() - # Check freeze options - assert cfg.model.freeze_language_model is True + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True + assert cfg.model.freeze_vision_projection is False + + +def test_gemma3_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() + + _assert_basic_config(cfg) + + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" + + +def test_gemma3_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is 
correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_gemma3_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _gemma3_vl_module.gemma3_vl_4b_sft_config() + + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True diff --git a/tests/unit_tests/recipes/test_glm_45v_recipes.py b/tests/unit_tests/recipes/test_glm_45v_recipes.py index d60d6636c2..c459289aaf 100644 --- a/tests/unit_tests/recipes/test_glm_45v_recipes.py +++ b/tests/unit_tests/recipes/test_glm_45v_recipes.py @@ -16,9 +16,9 @@ # Test purpose: # - Parametrize over all exported GLM-4.5V recipe functions in `megatron.bridge.recipes.glm_vl.glm_45v`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. -# - Test pipeline model parallel layout for asymmetric stages. +# - Test MoE-specific settings for this MoE VLM model. 
# import importlib @@ -28,33 +28,19 @@ _glm_45v_module = importlib.import_module("megatron.bridge.recipes.glm_vl.glm_45v") -_GLM_45V_RECIPE_FUNCS = [ - _glm_45v_module.glm_45v_finetune_config, + +# SFT configs (parameterless) +_GLM_45V_SFT_FUNCS = [ + _glm_45v_module.glm_45v_sft_config, ] +# PEFT configs (take peft_scheme parameter) +_GLM_45V_PEFT_FUNCS = [ + _glm_45v_module.glm_45v_peft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "context_parallel_size": 1, - "sequence_parallel": False, - "virtual_pipeline_model_parallel_size": None, - } - - return overrides +# All recipe functions +_GLM_45V_ALL_FUNCS = _GLM_45V_SFT_FUNCS + _GLM_45V_PEFT_FUNCS class _FakeModelCfg: @@ -66,19 +52,23 @@ def __init__(self): self.pipeline_model_parallel_size = 1 self.pipeline_dtype = None self.virtual_pipeline_model_parallel_size = None - self.expert_model_parallel_size = 1 self.context_parallel_size = 1 + self.expert_model_parallel_size = 1 self.sequence_parallel = False self.seq_length = 64 self.freeze_language_model = False self.freeze_vision_model = False self.freeze_vision_projection = False - # Pipeline layout attributes - self.pipeline_model_parallel_layout = None - self.account_for_embedding_in_pipeline_split = True - self.account_for_loss_in_pipeline_split = True - self.num_layers_in_first_pipeline_stage = None - self.num_layers_in_last_pipeline_stage = None + # MoE-specific + self.moe_token_dispatcher_type = None + self.moe_flex_dispatcher_backend = None + self.moe_hybridep_num_sms = None + self.moe_router_fusion = False + self.moe_permute_fusion = False 
+ self.moe_grouped_gemm = False + self.moe_router_padding_for_fp8 = False + self.moe_shared_expert_overlap = False + self.moe_router_force_load_balancing = False def finalize(self): return None @@ -117,15 +107,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _GLM_45V_RECIPE_FUNCS) -def test_each_glm_45v_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each GLM-4.5V recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _GLM_45V_SFT_FUNCS) +def test_each_glm_45v_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each GLM-4.5V SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -136,156 +124,111 @@ def test_each_glm_45v_recipe_builds_config(recipe_func: Callable, monkeypatch: p # Verify parallelism settings assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - assert getattr(cfg.model, "expert_model_parallel_size", 1) >= 1 # Verify freeze settings are set assert hasattr(cfg.model, "freeze_language_model") assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_glm_45v_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - - overrides = 
_safe_overrides_for("glm_45v_finetune_config") - overrides["dataset_type"] = dataset_type - - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider +@pytest.mark.parametrize("recipe_func", _GLM_45V_PEFT_FUNCS) +def test_each_glm_45v_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each GLM-4.5V PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + cfg = recipe_func() # Default peft_scheme="lora" + _assert_basic_config(cfg) -def test_glm_45v_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - overrides = _safe_overrides_for("glm_45v_finetune_config") - 
overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def test_glm_45v_invalid_dataset_type(monkeypatch: pytest.MonkeyPatch): - """Test that invalid dataset_type raises ValueError.""" +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_glm_45v_peft_schemes(peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for GLM-4.5V model.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["dataset_type"] = "invalid_type" - - with pytest.raises(ValueError, match="Unsupported dataset_type"): - _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_GLM_45V_FINETUNE_FUNCS = [ - _glm_45v_module.glm_45v_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _GLM_45V_FINETUNE_FUNCS) 
-@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_glm_45v_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for GLM-4.5V models.""" +def test_glm_45v_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V SFT has correct default parallelism and MoE settings.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, GLM-4.5V should use TP=1, PP=8, EP=16 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 8 + assert cfg.peft is None + + # Check expert_model_parallel_size for MoE model + assert cfg.model.expert_model_parallel_size == 16 -def test_glm_45v_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that GLM-4.5V LoRA has correct default parallelism and learning rate.""" +def test_glm_45v_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = 
_glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme="lora") _assert_basic_config(cfg) - # For LoRA, GLM-4.5V should use TP=1, PP=8, EP=4 + # For LoRA, GLM-4.5V should use TP=1, PP=8 assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 4 # Check PEFT config assert cfg.peft is not None assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_glm_45v_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that GLM-4.5V DoRA has correct default parallelism and learning rate.""" +def test_glm_45v_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that GLM-4.5V DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "dora" - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_peft_config(peft_scheme="dora") _assert_basic_config(cfg) - # For DoRA, GLM-4.5V should use same parallelism as LoRA + # For DoRA, should use same parallelism as LoRA (TP=1, PP=8) assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 4 # Check PEFT config (DoRA has alpha=64 by default, unlike LoRA's alpha=32) assert cfg.peft is not None @@ -293,216 +236,54 @@ def test_glm_45v_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_glm_45v_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 
GLM-4.5V full SFT has correct default parallelism and learning rate.""" +def test_glm_45v_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = None - # Remove parallelism overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) + cfg = _glm_45v_module.glm_45v_sft_config() - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # For full SFT, GLM-4.5V should use TP=1, PP=8, EP=16 - assert cfg.model.tensor_model_parallel_size == 1 - assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 16 - assert cfg.peft is None + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 + assert isinstance(cfg.dataset, HFDatasetConversationProvider) -def test_glm_45v_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" +def test_glm_45v_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _glm_45v_module.glm_45v_peft_config() - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) 
+ from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 + assert isinstance(cfg.dataset, HFDatasetConversationProvider) -def test_glm_45v_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" +def test_glm_45v_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() - _assert_basic_config(cfg) - - # Check PEFT config - assert cfg.peft is not None - - # Check freeze options - assert cfg.model.freeze_language_model is True + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True - - -# Pipeline layout tests - - -def test_glm_45v_pipeline_layout_pp4(): - """Test pipeline layout for PP=4.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 4 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=4 should have 4 stages - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 4 - # First stage: embedding + 11 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def 
test_glm_45v_pipeline_layout_pp8(): - """Test pipeline layout for PP=8.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 8 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=8 should have 8 stages (full SFT layout: embedding+1, then 7*6, then 3+loss) - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 8 - # First stage: embedding + 1 decoder layer - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 1 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp16(): - """Test pipeline layout for PP=16.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 16 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg) - - # PP=16 should have 16 stages (full SFT layout: embedding alone, then 3*14, then 3+loss) - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 16 - # First stage: embedding only (no decoder layers, to balance vision encoder cost) - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 0 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp8_peft(): - """Test pipeline layout for PP=8 with PEFT.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 8 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, is_peft=True) - - # PP=8 PEFT layout: embedding+5, then 6*6, then 5+loss - assert 
model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 8 - # First stage: embedding + 5 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 5 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_pp16_peft(): - """Test pipeline layout for PP=16 with PEFT.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 16 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, is_peft=True) - - # PP=16 PEFT layout: embedding+2, then 3*14, then 2+loss - assert model_cfg.pipeline_model_parallel_layout is not None - assert len(model_cfg.pipeline_model_parallel_layout) == 16 - # First stage: embedding + 2 decoder layers - assert model_cfg.pipeline_model_parallel_layout[0][0] == "embedding" - assert model_cfg.pipeline_model_parallel_layout[0].count("decoder") == 2 - # Last stage should have loss - assert "loss" in model_cfg.pipeline_model_parallel_layout[-1] - - -def test_glm_45v_pipeline_layout_custom(): - """Test that custom pipeline layout overrides defaults.""" - model_cfg = _FakeModelCfg() - model_cfg.pipeline_model_parallel_size = 2 - model_cfg.virtual_pipeline_model_parallel_size = 1 - - custom_layout = [["embedding"] + ["decoder"] * 20, ["decoder"] * 26 + ["loss"]] - _glm_45v_module.set_glm_45v_pipeline_model_parallel_layout(model_cfg, layout=custom_layout) - - # Custom layout should be used - assert model_cfg.pipeline_model_parallel_layout == custom_layout - - -def test_glm_45v_pipeline_layout_in_config(monkeypatch: pytest.MonkeyPatch): - """Test that pipeline layout is correctly set in the full config.""" - # Monkeypatch AutoBridge - monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - - overrides = 
_safe_overrides_for("glm_45v_finetune_config") - overrides["pipeline_model_parallel_size"] = 8 - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) - - # Check that pipeline layout is set - assert cfg.model.pipeline_model_parallel_layout is not None - # Check that asymmetric pipeline split settings are disabled - assert cfg.model.account_for_embedding_in_pipeline_split is False - assert cfg.model.account_for_loss_in_pipeline_split is False + assert cfg.model.freeze_vision_projection is False -def test_glm_45v_wandb_logging(monkeypatch: pytest.MonkeyPatch): - """Test that W&B logging options are correctly passed.""" +def test_glm_45v_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["wandb_project"] = "test_project" - overrides["wandb_entity"] = "test_entity" - overrides["wandb_exp_name"] = "test_exp" - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) - - _assert_basic_config(cfg) + cfg = _glm_45v_module.glm_45v_peft_config() - assert cfg.logger.wandb_project == "test_project" - assert cfg.logger.wandb_entity == "test_entity" - assert cfg.logger.wandb_exp_name == "test_exp" + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): @@ -510,9 +291,7 @@ def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = 
_glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) @@ -520,56 +299,64 @@ def test_glm_45v_precision_config(monkeypatch: pytest.MonkeyPatch): assert cfg.mixed_precision == "bf16_mixed" -def test_glm_45v_peft_none_string(monkeypatch: pytest.MonkeyPatch): - """Test that peft='none' (string) is treated as full SFT.""" +def test_glm_45v_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["peft"] = "none" - # Remove parallelism overrides to test recipe defaults - overrides.pop("expert_model_parallel_size", None) - overrides.pop("finetune_lr", None) - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # peft="none" should be treated as full SFT - assert cfg.peft is None - # Should use full SFT defaults: EP=16, LR=5e-6 - assert cfg.model.expert_model_parallel_size == 16 - assert cfg.optimizer.lr == 5e-6 + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True -def test_glm_45v_ddp_config(monkeypatch: pytest.MonkeyPatch): - """Test that DDP config is correctly set.""" +def test_glm_45v_moe_settings(monkeypatch: pytest.MonkeyPatch): + """Test that MoE-specific settings are correctly configured.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - # Check DDP settings - assert cfg.ddp.check_for_nan_in_grad is True - assert cfg.ddp.grad_reduce_in_fp32 is True - assert 
cfg.ddp.use_distributed_optimizer is True - assert cfg.ddp.data_parallel_sharding_strategy == "optim_grads_params" + # Check MoE-specific settings + assert hasattr(cfg.model, "moe_token_dispatcher_type") + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + assert hasattr(cfg.model, "moe_hybridep_num_sms") + assert hasattr(cfg.model, "moe_router_fusion") + assert hasattr(cfg.model, "moe_permute_fusion") + assert hasattr(cfg.model, "moe_grouped_gemm") + assert hasattr(cfg.model, "moe_router_padding_for_fp8") + assert hasattr(cfg.model, "moe_shared_expert_overlap") + assert hasattr(cfg.model, "moe_router_force_load_balancing") + + # Verify default MoE kernel settings + assert cfg.model.moe_router_fusion is False + assert cfg.model.moe_permute_fusion is True + assert cfg.model.moe_grouped_gemm is True -def test_glm_45v_megatron_fsdp(monkeypatch: pytest.MonkeyPatch): - """Test that Megatron FSDP option is correctly passed.""" +def test_glm_45v_pipeline_layout_function_exists(): + """Test that pipeline layout function is exported.""" + assert hasattr(_glm_45v_module, "set_glm_45v_pipeline_model_parallel_layout") + assert callable(_glm_45v_module.set_glm_45v_pipeline_model_parallel_layout) + + +def test_glm_45v_sft_uses_pipeline_layout(monkeypatch: pytest.MonkeyPatch): + """Test that SFT config has pipeline model parallel layout set.""" # Monkeypatch AutoBridge monkeypatch.setattr(_glm_45v_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("glm_45v_finetune_config") - overrides["use_megatron_fsdp"] = True - - cfg = _glm_45v_module.glm_45v_finetune_config(**overrides) + cfg = _glm_45v_module.glm_45v_sft_config() _assert_basic_config(cfg) - assert cfg.ddp.use_megatron_fsdp is True + # PP should be set when pipeline layout is used + assert cfg.model.pipeline_model_parallel_size >= 1 + # Check if pipeline_model_parallel_layout is set + assert hasattr(cfg.model, "pipeline_model_parallel_layout") diff --git 
a/tests/unit_tests/recipes/test_ministral3_recipes.py b/tests/unit_tests/recipes/test_ministral3_recipes.py index 836d87f41b..1b4fc878a8 100644 --- a/tests/unit_tests/recipes/test_ministral3_recipes.py +++ b/tests/unit_tests/recipes/test_ministral3_recipes.py @@ -16,7 +16,7 @@ # Test purpose: # - Parametrize over all exported Ministral3 recipe functions in `megatron.bridge.recipes.ministral3.ministral3`. # - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. -# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Build a config and assert it forms a valid `ConfigContainer`. # - Verify dataset provider selection and sanity-check parallelism fields. # @@ -27,42 +27,20 @@ _ministral3_module = importlib.import_module("megatron.bridge.recipes.ministral3.ministral3") -_MINISTRAL3_RECIPE_FUNCS = [ - _ministral3_module.ministral3_3b_finetune_config, - _ministral3_module.ministral3_8b_finetune_config, - _ministral3_module.ministral3_14b_finetune_config, -] +# SFT configs (parameterless) +_MINISTRAL3_SFT_FUNCS = [ + _ministral3_module.ministral3_3b_sft_config, + _ministral3_module.ministral3_8b_sft_config, + _ministral3_module.ministral3_14b_sft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - - # Large models may set additional flags in recipes; keep harmless defaults - lname = name.lower() - if "8b" in lname or "14b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": False, - } - ) - - return overrides +# PEFT configs (take peft_scheme 
parameter) +_MINISTRAL3_PEFT_FUNCS = [ + _ministral3_module.ministral3_3b_peft_config, + _ministral3_module.ministral3_8b_peft_config, + _ministral3_module.ministral3_14b_peft_config, +] class _FakeModelCfg: @@ -118,15 +96,13 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -@pytest.mark.parametrize("recipe_func", _MINISTRAL3_RECIPE_FUNCS) -def test_each_ministral3_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Test that each Ministral3 recipe function builds a valid configuration.""" +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_SFT_FUNCS) +def test_each_ministral3_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Ministral3 SFT recipe function builds a valid configuration.""" # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) @@ -143,101 +119,78 @@ def test_each_ministral3_recipe_builds_config(recipe_func: Callable, monkeypatch assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + # SFT configs should not have PEFT + assert cfg.peft is None -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_ministral3_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Test that different dataset_type values produce correct dataset providers.""" - # Monkeypatch AutoBridge + +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_PEFT_FUNCS) +def test_each_ministral3_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Ministral3 PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config monkeypatch.setattr(_ministral3_module, "AutoBridge", 
_FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["dataset_type"] = dataset_type + cfg = recipe_func() # Default peft_scheme="lora" - # For preloaded, we need to provide data paths - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - # Check that appropriate dataset provider is used - from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider - from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + # PEFT configs should have PEFT configured + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def test_ministral3_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that freeze options are correctly passed to the model 
config.""" +@pytest.mark.parametrize("recipe_func", _MINISTRAL3_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_ministral3_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Ministral3 models.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) - - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + cfg = recipe_func(peft_scheme=peft_scheme) + _assert_basic_config(cfg) -# PEFT-specific tests -_MINISTRAL3_FINETUNE_FUNCS = [ - _ministral3_module.ministral3_3b_finetune_config, - _ministral3_module.ministral3_8b_finetune_config, - _ministral3_module.ministral3_14b_finetune_config, -] + # Check PEFT config presence + assert cfg.peft is not None + # Verify PEFT config has expected attributes + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -@pytest.mark.parametrize("recipe_func", _MINISTRAL3_FINETUNE_FUNCS) -@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_ministral3_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """Test that PEFT and full SFT configurations are correctly applied for Ministral3 models.""" +def test_ministral3_3b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = 
_ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check PEFT config presence - if peft in ["lora", "dora"]: - assert cfg.peft is not None - # Verify PEFT config has expected attributes - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None + # For full SFT, 3B should use TP=1, PP=1 + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None -def test_ministral3_3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B LoRA has correct default parallelism and learning rate.""" +def test_ministral3_3b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B LoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_peft_config(peft_scheme="lora") _assert_basic_config(cfg) @@ -250,22 +203,13 @@ def test_ministral3_3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.dim == 32 assert cfg.peft.alpha == 32 - # Check that learning rate defaults to 1e-4 for LoRA - assert cfg.optimizer.lr == 1e-4 - -def test_ministral3_3b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B DoRA has correct default parallelism and learning rate.""" +def test_ministral3_3b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 3B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", 
_FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) @@ -279,44 +223,27 @@ def test_ministral3_3b_dora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft.alpha == 64 -def test_ministral3_3b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 3B full SFT has correct default parallelism and learning rate.""" +def test_ministral3_8b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 8B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - # Remove finetune_lr to test default - overrides.pop("finetune_lr", None) - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_8b_sft_config() _assert_basic_config(cfg) - # For full SFT, 3B should use TP=1, PP=1 - assert cfg.model.tensor_model_parallel_size == 1 + # For full SFT, 8B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None - # Check that learning rate defaults to 5e-6 for full SFT - assert cfg.optimizer.lr == 5e-6 - -def test_ministral3_8b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 8B LoRA has correct default parallelism.""" +def test_ministral3_8b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 
8B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_8b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_8b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_8b_peft_config() _assert_basic_config(cfg) @@ -328,39 +255,27 @@ def test_ministral3_8b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_ministral3_8b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 8B full SFT has correct default parallelism.""" +def test_ministral3_14b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B SFT has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_8b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_8b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_sft_config() _assert_basic_config(cfg) - # For full SFT, 8B should use TP=2, PP=1 - assert cfg.model.tensor_model_parallel_size == 2 + # For full SFT, 14B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None -def test_ministral3_14b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B LoRA has correct default parallelism.""" +def test_ministral3_14b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B PEFT has correct default parallelism.""" # Monkeypatch AutoBridge 
monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = "lora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_peft_config() _assert_basic_config(cfg) @@ -372,86 +287,85 @@ def test_ministral3_14b_lora_defaults(monkeypatch: pytest.MonkeyPatch): assert cfg.peft is not None -def test_ministral3_14b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B full SFT has correct default parallelism.""" +def test_ministral3_14b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 14B DoRA has correct default parallelism.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = None - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_14b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) - # For full SFT, 14B should use TP=4, PP=1 - assert cfg.model.tensor_model_parallel_size == 4 + # For DoRA, 14B should use same parallelism as LoRA (TP=2, PP=1) + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is None + # Check PEFT config + assert cfg.peft is not None -def test_ministral3_14b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """Test that 14B DoRA has correct default parallelism.""" + +def test_ministral3_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use 
HFDatasetConversationProvider by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_14b_finetune_config") - overrides["peft"] = "dora" - # Remove TP/PP overrides to test recipe defaults - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) + cfg = _ministral3_module.ministral3_3b_sft_config() - cfg = _ministral3_module.ministral3_14b_finetune_config(**overrides) + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - _assert_basic_config(cfg) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # For DoRA, 14B should use same parallelism as LoRA (TP=2, PP=1) - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - # Check PEFT config - assert cfg.peft is not None +def test_ministral3_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) + cfg = _ministral3_module.ministral3_3b_peft_config() -def test_ministral3_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Test that custom finetune_lr overrides default learning rate.""" + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider + + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + + +def test_ministral3_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 # Custom learning rate + cfg = _ministral3_module.ministral3_3b_sft_config() - 
cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_ministral3_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) + + cfg = _ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check that custom learning rate is used - assert cfg.optimizer.lr == 2e-4 + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" -def test_ministral3_peft_with_freeze_options(monkeypatch: pytest.MonkeyPatch): - """Test that PEFT can be combined with freeze options.""" +def test_ministral3_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set.""" # Monkeypatch AutoBridge monkeypatch.setattr(_ministral3_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("ministral3_3b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = True - - cfg = _ministral3_module.ministral3_3b_finetune_config(**overrides) + cfg = _ministral3_module.ministral3_3b_sft_config() _assert_basic_config(cfg) - # Check PEFT config - assert cfg.peft is not None - - # Check freeze options - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is True + # Check DDP settings + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True + assert cfg.ddp.grad_reduce_in_fp32 is True diff --git 
a/tests/unit_tests/recipes/test_nemotron_vl_recipes.py b/tests/unit_tests/recipes/test_nemotron_vl_recipes.py index f7fe85f512..82e1bd1a01 100644 --- a/tests/unit_tests/recipes/test_nemotron_vl_recipes.py +++ b/tests/unit_tests/recipes/test_nemotron_vl_recipes.py @@ -12,39 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. +# +# Test purpose: +# - Parametrize over all exported Nemotron VL recipe functions in `megatron.bridge.recipes.nemotron_vl`. +# - For each recipe, monkeypatch AutoBridge and the provider to avoid I/O. +# - Build a config and assert it forms a valid `ConfigContainer`. +# - Verify dataset provider selection and sanity-check parallelism fields. +# + import importlib +from typing import Callable import pytest -import torch -_nemotron_module = importlib.import_module("megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl") +_nemotron_vl_module = importlib.import_module("megatron.bridge.recipes.nemotron_vl.nemotron_nano_v2_vl") +# SFT configs (parameterless) +_NEMOTRON_VL_SFT_FUNCS = [ + _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config, +] -def _safe_overrides() -> dict: - """Create safe test overrides for Nemotron VL recipe functions.""" - return { - "name": "unit_nemotron_vl", - "dir": ".", - "hf_model_path": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_parallelism": 1, - "pipeline_parallelism": 1, - "context_parallelism": 1, - "sequence_parallelism": False, - } +# PEFT configs (take peft_scheme parameter) +_NEMOTRON_VL_PEFT_FUNCS = [ + _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config, +] class _FakeModelCfg: """Fake model configuration for testing.""" def __init__(self): + # Set default attributes that recipes might set self.tensor_model_parallel_size = 1 self.pipeline_model_parallel_size = 1 self.pipeline_dtype = 
None @@ -61,17 +60,20 @@ def finalize(self): class _FakeAutoBridge: - """Fake AutoBridge for testing to avoid HF downloads and I/O.""" + """Fake AutoBridge for testing.""" @staticmethod - def from_hf_pretrained(hf_path: str, *args, **kwargs): + def from_hf_pretrained(hf_path: str, **kwargs): + """Mock from_hf_pretrained method.""" return _FakeAutoBridge() def to_megatron_provider(self, load_weights: bool = False): + """Return a fake model config.""" return _FakeModelCfg() def _assert_basic_config(cfg): + """Assert that a config has all required components.""" from megatron.bridge.training.config import ConfigContainer assert isinstance(cfg, ConfigContainer) @@ -90,104 +92,212 @@ def _assert_basic_config(cfg): assert cfg.dataset.seq_length >= 1 -def test_nemotron_vl_pretrain_builds_config(monkeypatch: pytest.MonkeyPatch): - """Test that pretrain_config builds a valid configuration and sets basic fields.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) +@pytest.mark.parametrize("recipe_func", _NEMOTRON_VL_SFT_FUNCS) +def test_each_nemotron_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Nemotron VL SFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() + + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # SFT configs should 
not have PEFT + assert cfg.peft is None + + +@pytest.mark.parametrize("recipe_func", _NEMOTRON_VL_PEFT_FUNCS) +def test_each_nemotron_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Nemotron VL PEFT recipe function builds a valid configuration.""" + # Monkeypatch AutoBridge to return a fake model config + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = recipe_func() # Default peft_scheme="lora" + + _assert_basic_config(cfg) + + # Check that NullTokenizer is used + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + # Verify parallelism settings + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + + # Verify freeze settings are set + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + # PEFT configs should have PEFT configured + assert cfg.peft is not None + + +def test_nemotron_vl_12b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B SFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + _assert_basic_config(cfg) + + # For full SFT, 12B should use TP=4, PP=1 + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + + +def test_nemotron_vl_12b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that 12B PEFT has correct default parallelism.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_pretrain_config(**overrides) + cfg 
= _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() _assert_basic_config(cfg) - # Dataset provider should be HF-based - from megatron.bridge.data.vlm_datasets import HFDatasetConversationProvider + # For PEFT, 12B should use TP=2, PP=1 + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + + # Check PEFT config (uses VLMLoRA) + assert cfg.peft is not None + + +def test_nemotron_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider assert isinstance(cfg.dataset, HFDatasetConversationProvider) - # Null tokenizer is used - assert getattr(cfg.tokenizer, "tokenizer_type", None) == "NullTokenizer" - # Parallelism settings should be wired into model cfg - assert getattr(cfg.model, "tensor_model_parallel_size", 0) == overrides["tensor_parallelism"] - assert getattr(cfg.model, "pipeline_model_parallel_size", 0) == overrides["pipeline_parallelism"] - assert getattr(cfg.model, "context_parallel_size", 0) == overrides["context_parallelism"] - assert getattr(cfg.model, "sequence_parallel", None) is overrides["sequence_parallelism"] - assert getattr(cfg.model, "seq_length", 0) == overrides["seq_length"] +def test_nemotron_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -def test_nemotron_vl_pretrain_pipeline_dtype(monkeypatch: 
pytest.MonkeyPatch): - """Test that pipeline_parallelism_dtype is respected.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - overrides = _safe_overrides() - overrides["pipeline_parallelism_dtype"] = torch.bfloat16 - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_pretrain_config(**overrides) +def test_nemotron_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - assert getattr(cfg.model, "pipeline_dtype", None) is torch.bfloat16 + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + # Default freeze options should be False for full SFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False -def test_nemotron_vl_finetune_with_lora(monkeypatch: pytest.MonkeyPatch): - """Test finetune_config wiring including LoRA when enabled.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - lora_on_language_model=True, - lora_on_vision_model=False, - **overrides, - ) +def test_nemotron_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + + # Default freeze options should be False for PEFT + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_nemotron_vl_precision_config(monkeypatch: 
pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() _assert_basic_config(cfg) - # Check that checkpoint wiring includes the pretrained checkpoint - assert getattr(cfg.checkpoint, "pretrained_checkpoint", None) == "/fake/ckpt" + # Default should be bf16_mixed + assert cfg.mixed_precision == "bf16_mixed" - # LoRA should be configured - from megatron.bridge.peft.lora import VLMLoRA - assert isinstance(getattr(cfg, "peft", None), VLMLoRA) +def test_nemotron_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) - # Finetune defaults applied (since overrides didn't provide finetune-specific lr) - assert hasattr(cfg.optimizer, "lr") and cfg.optimizer.lr == 5e-5 - assert hasattr(cfg.optimizer, "min_lr") and cfg.optimizer.min_lr == 5e-6 - assert getattr(cfg.model, "tensor_model_parallel_size", None) == 2 + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + _assert_basic_config(cfg) + + # VLMs should have overlap disabled + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True -def test_nemotron_vl_finetune_without_lora(monkeypatch: pytest.MonkeyPatch): - """Test finetune_config when LoRA is disabled.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - del overrides["lr"] - del overrides["min_lr"] - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - lora_on_language_model=False, - **overrides, - ) +def test_nemotron_vl_peft_uses_vlm_lora(monkeypatch: pytest.MonkeyPatch): + """Test that 
Nemotron Nano V2 VL uses VLMLoRA for PEFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() _assert_basic_config(cfg) - # No PEFT configured - assert getattr(cfg, "peft", None) is None + # Check PEFT config is present (should be VLMLoRA) + assert cfg.peft is not None - # Finetune defaults applied when not explicitly provided in overrides - assert hasattr(cfg.optimizer, "lr") and cfg.optimizer.lr == 1e-5 - assert hasattr(cfg.optimizer, "min_lr") and cfg.optimizer.min_lr == 1e-6 + # Check PEFT type is VLMLoRA + from megatron.bridge.peft.lora import VLMLoRA + assert isinstance(cfg.peft, VLMLoRA) -def test_nemotron_vl_finetune_custom_save_dir(monkeypatch: pytest.MonkeyPatch): - """Test that save_checkpoint_dir overrides are respected in finetune_config.""" - monkeypatch.setattr(_nemotron_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides() - cfg = _nemotron_module.nemotron_nano_v2_vl_12b_finetune_config( - pretrained_checkpoint="/fake/ckpt", - save_checkpoint_dir="/fake/save", - **overrides, - ) +def test_nemotron_vl_sft_training_params(monkeypatch: pytest.MonkeyPatch): + """Test that training parameters are correctly set for SFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_sft_config() + + _assert_basic_config(cfg) + + # Check training parameters + assert cfg.train.train_iters == 2000 + assert cfg.train.micro_batch_size == 1 + + +def test_nemotron_vl_peft_training_params(monkeypatch: pytest.MonkeyPatch): + """Test that training parameters are correctly set for PEFT.""" + # Monkeypatch AutoBridge + monkeypatch.setattr(_nemotron_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _nemotron_vl_module.nemotron_nano_v2_vl_12b_peft_config() + + _assert_basic_config(cfg) - assert getattr(cfg.checkpoint, "save", None) == 
"/fake/save" - assert getattr(cfg.checkpoint, "load", None) == "/fake/save" + # Check training parameters (should match SFT after update) + assert cfg.train.train_iters == 2000 + assert cfg.train.micro_batch_size == 1