diff --git a/examples/models/vlm/ministral3/README.md b/examples/models/vlm/ministral3/README.md
index 578dd8908b..0c1cd0fcba 100644
--- a/examples/models/vlm/ministral3/README.md
+++ b/examples/models/vlm/ministral3/README.md
@@ -85,9 +85,12 @@ The current date is {today}.
 
 - See: [bridge.recipes.ministral3](../../apidocs/bridge/bridge.recipes.ministral3.md)
 - Available recipes:
-  - `ministral3_3b_finetune_config`: Finetuning for 3B VL model with PEFT support
-  - `ministral3_8b_finetune_config`: Finetuning for 8B VL model with PEFT support
-  - `ministral3_14b_finetune_config`: Finetuning for 14B VL model with PEFT support
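+  - `ministral3_3b_sft_config`: Full-parameter supervised fine-tuning (SFT) for the 3B VL model
+  - `ministral3_8b_sft_config`: Full-parameter supervised fine-tuning (SFT) for the 8B VL model
+  - `ministral3_14b_sft_config`: Full-parameter supervised fine-tuning (SFT) for the 14B VL model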
+  - `ministral3_3b_peft_config`: Parameter-efficient fine-tuning (PEFT, e.g. LoRA) for the 3B VL model
+  - `ministral3_8b_peft_config`: Parameter-efficient fine-tuning (PEFT, e.g. LoRA) for the 8B VL model
+  - `ministral3_14b_peft_config`: Parameter-efficient fine-tuning (PEFT, e.g. LoRA) for the 14B VL model
 
 Before training, ensure the following environment variables are set:
 1. `SAVE_DIR`: checkpoint and log saving directory
@@ -101,15 +104,11 @@ Pretraining is not verified for this model.
 
 ### Supervised Fine-Tuning (SFT)
 
-See the [sft.sh](sft.sh) script for full parameter fine-tuning with configurable model parallelisms.
-
-W&B report coming soon.
+See the [sft_unpacked.sh](sft_unpacked.sh) script for full-parameter fine-tuning on unpacked sequences with configurable model parallelism.
 
 ### Parameter-Efficient Fine-Tuning (PEFT) with LoRA
 
-See the [peft.sh](peft.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism.
-
-W&B report coming soon.
+See the [peft_unpacked.sh](peft_unpacked.sh) script for LoRA fine-tuning on unpacked sequences with configurable tensor and pipeline parallelism.
 
 ### Recommended Configurations
 
@@ -124,6 +123,9 @@ W&B report coming soon.
 
 **Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs.
 
+### Expected Training Dynamics
+We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/h32cflfn) showing the expected loss curves and gradient norms.
+
 ## Evaluation
 
 Coming soon.
diff --git a/examples/models/vlm/ministral3/peft.sh b/examples/models/vlm/ministral3/peft_unpacked.sh
similarity index 92%
rename from examples/models/vlm/ministral3/peft.sh
rename to examples/models/vlm/ministral3/peft_unpacked.sh
index b3c44a2f86..2f7be5b8a4 100755
--- a/examples/models/vlm/ministral3/peft.sh
+++ b/examples/models/vlm/ministral3/peft_unpacked.sh
@@ -29,12 +29,12 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16
 MODEL_NAME=ministral3_3b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR=0.0002
-MIN_LR=0.00002
+EVAL_ITERS=20
+LR=0.00005
+MIN_LR=0.000005
 LR_WARMUP_ITERS=10
 LOG_INTERVAL=1
 WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
@@ -47,7 +47,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
     
     echo "Running LoRA finetuning with TP=$TP, PP=$PP"
     uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
+        --recipe ${MODEL_NAME}_peft_config \
         --step_func vlm_step \
         --peft_scheme lora \
         checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
@@ -55,7 +55,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         train.train_iters=$TRAIN_ITERS \
         train.global_batch_size=$GLOBAL_BATCH_SIZE \
         train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
+        validation.eval_iters=$EVAL_ITERS \
         optimizer.lr=$LR \
         optimizer.min_lr=$MIN_LR \
         scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
@@ -65,6 +65,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP} \
         dataset.maker_name=make_${DATASET_NAME}_dataset \
         dataset.seq_length=$SEQ_LENGTH \
+        dataset.pack_sequences_in_batch=False \
         model.tensor_model_parallel_size=$TP \
         model.pipeline_model_parallel_size=$PP
 done
diff --git a/examples/models/vlm/ministral3/sft.sh b/examples/models/vlm/ministral3/sft_unpacked.sh
similarity index 92%
rename from examples/models/vlm/ministral3/sft.sh
rename to examples/models/vlm/ministral3/sft_unpacked.sh
index a22eebbb03..431f7963e0 100755
--- a/examples/models/vlm/ministral3/sft.sh
+++ b/examples/models/vlm/ministral3/sft_unpacked.sh
@@ -29,12 +29,12 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16
 MODEL_NAME=ministral3_3b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR=0.00005
-MIN_LR=0.000005
+EVAL_ITERS=20
+LR=0.00001
+MIN_LR=0.000001
 LR_WARMUP_ITERS=10
 LOG_INTERVAL=1
 WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
@@ -47,14 +47,14 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
     
     echo "Running full finetuning with TP=$TP, PP=$PP"
     uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
+        --recipe ${MODEL_NAME}_sft_config \
         --step_func vlm_step \
         checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
         model.seq_length=$SEQ_LENGTH \
         train.train_iters=$TRAIN_ITERS \
         train.global_batch_size=$GLOBAL_BATCH_SIZE \
         train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
+        validation.eval_iters=$EVAL_ITERS \
         optimizer.lr=$LR \
         optimizer.min_lr=$MIN_LR \
         scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
@@ -64,6 +64,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_tp${TP}_pp${PP} \
         dataset.maker_name=make_${DATASET_NAME}_dataset \
         dataset.seq_length=$SEQ_LENGTH \
+        dataset.pack_sequences_in_batch=False \
         model.tensor_model_parallel_size=$TP \
         model.pipeline_model_parallel_size=$PP
 done
diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md
index da9da325a1..c0fb5fc9b7 100644
--- a/examples/models/vlm/qwen3_vl/README.md
+++ b/examples/models/vlm/qwen3_vl/README.md
@@ -106,15 +106,11 @@ Before training, ensure the following environment variables are set:
 See the [sft_unpacked.sh](sft_unpacked.sh) script for full parameter fine-tuning with configurable model parallelisms, with unpacked sequences.
 See the [sft.sh](sft.sh) script for full parameter fine-tuning with sequence-packing.
 
-W&B report coming soon.
-
 ### Parameter-Efficient Fine-Tuning (PEFT) with LoRA
 
 See the [peft_unpacked.sh](peft_unpacked.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism, with unpacked sequences.
 See the [peft.sh](peft.sh) script for LoRA fine-tuning with sequence-packing.
 
-W&B report coming soon.
-
 **Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs.
 
 ## Finetuning with Energon Dataset
@@ -129,7 +125,11 @@ field_map:
   conversation: json
 ```
 
-Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [sft_energon.sh](sft_energon.sh) and run the script.
+Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [peft_energon.sh](peft_energon.sh) and run the script.
+
+
+### Expected Training Dynamics
+We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/lczz4ixx) showing the expected loss curves and gradient norms.
 
 ## Evaluation
 
diff --git a/examples/models/vlm/qwen3_vl/peft.sh b/examples/models/vlm/qwen3_vl/peft.sh
index de116903e1..80ce6226cd 100755
--- a/examples/models/vlm/qwen3_vl/peft.sh
+++ b/examples/models/vlm/qwen3_vl/peft.sh
@@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
 MODEL_NAME=qwen3_vl_8b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=2
-EVAL_ITERS=10
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -45,7 +46,7 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
         IFS=',' read -r EP TP PP CP <<< "$par_config"
         echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP"
         uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-            --recipe ${MODEL_NAME}_finetune_config \
+            --recipe ${MODEL_NAME}_peft_config \
             --step_func qwen3_vl_step \
             --peft_scheme lora \
             checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
@@ -53,7 +54,8 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
             train.train_iters=$TRAIN_ITERS \
             train.global_batch_size=$GLOBAL_BATCH_SIZE \
             train.micro_batch_size=$MICRO_BATCH_SIZE \
-            train.eval_iters=$EVAL_ITERS \
+            validation.eval_iters=$EVAL_ITERS \
+            validation.eval_interval=$EVAL_INTERVAL \
             optimizer.lr=$LR \
             optimizer.min_lr=$MIN_LR \
             scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
@@ -80,10 +82,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct
 MODEL_NAME=qwen3_vl_30b_a3b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=2
-EVAL_ITERS=10
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -100,7 +103,7 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
         IFS=',' read -r EP TP PP CP <<< "$par_config"
         echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP"
         uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-            --recipe ${MODEL_NAME}_finetune_config \
+            --recipe ${MODEL_NAME}_peft_config \
             --step_func qwen3_vl_step \
             --peft_scheme lora \
             checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
@@ -108,7 +111,8 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
             train.train_iters=$TRAIN_ITERS \
             train.global_batch_size=$GLOBAL_BATCH_SIZE \
             train.micro_batch_size=$MICRO_BATCH_SIZE \
-            train.eval_iters=$EVAL_ITERS \
+            validation.eval_iters=$EVAL_ITERS \
+            validation.eval_interval=$EVAL_INTERVAL \
             optimizer.lr=$LR \
             optimizer.min_lr=$MIN_LR \
             scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
diff --git a/examples/models/vlm/qwen3_vl/sft_energon.sh b/examples/models/vlm/qwen3_vl/peft_energon.sh
similarity index 90%
rename from examples/models/vlm/qwen3_vl/sft_energon.sh
rename to examples/models/vlm/qwen3_vl/peft_energon.sh
index 10188ac86d..dd43fb0242 100755
--- a/examples/models/vlm/qwen3_vl/sft_energon.sh
+++ b/examples/models/vlm/qwen3_vl/peft_energon.sh
@@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
 MODEL_NAME=qwen3_vl_8b
 DATASET_NAME=energon
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=2
-EVAL_ITERS=10
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -47,20 +48,20 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
         IFS=',' read -r EP TP PP CP N_PROC <<< "$par_config"
         echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP N_PROC=$N_PROC"
         uv run python -m torch.distributed.run --nproc_per_node=$N_PROC scripts/training/run_recipe.py \
-            --recipe ${MODEL_NAME}_finetune_config \
+            --recipe ${MODEL_NAME}_peft_energon_config \
             --step_func qwen3_vl_step \
             --peft_scheme lora \
-            --dataset_type energon \
             checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
             model.seq_length=$SEQ_LENGTH \
             train.train_iters=$TRAIN_ITERS \
             train.global_batch_size=$GLOBAL_BATCH_SIZE \
             train.micro_batch_size=$MICRO_BATCH_SIZE \
-            train.eval_iters=$EVAL_ITERS \
+            validation.eval_iters=$EVAL_ITERS \
+            validation.eval_interval=$EVAL_INTERVAL \
             optimizer.lr=$LR \
             optimizer.min_lr=$MIN_LR \
             scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_seq_pack_${pack_config}_cp${CP} \
+            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_energon_lora_seq_pack_${pack_config}_cp${CP} \
             logger.log_interval=$LOG_INTERVAL \
             logger.wandb_project=$WANDB_PROJECT \
             logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_seq_pack_${pack_config}_cp${CP} \
diff --git a/examples/models/vlm/qwen3_vl/peft_unpacked.sh b/examples/models/vlm/qwen3_vl/peft_unpacked.sh
index 06c4518935..90419918fa 100755
--- a/examples/models/vlm/qwen3_vl/peft_unpacked.sh
+++ b/examples/models/vlm/qwen3_vl/peft_unpacked.sh
@@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
 MODEL_NAME=qwen3_vl_8b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
+MICRO_BATCH_SIZE=2
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -36,14 +37,14 @@ LOG_INTERVAL=1
 WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
 
 # TP/PP combinations: "TP,PP"
-PARALLELISM_CONFIGS=("2,1" "1,2")
+PARALLELISM_CONFIGS=("4,1" "2,1")
 
 for config in "${PARALLELISM_CONFIGS[@]}"; do
     IFS=',' read -r TP PP <<< "$config"
     
     echo "Running LoRA finetuning with TP=$TP, PP=$PP"
-    uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
+    uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+        --recipe ${MODEL_NAME}_peft_config \
         --step_func qwen3_vl_step \
         --peft_scheme lora \
         checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
@@ -51,7 +52,8 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         train.train_iters=$TRAIN_ITERS \
         train.global_batch_size=$GLOBAL_BATCH_SIZE \
         train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
+        validation.eval_iters=$EVAL_ITERS \
+        validation.eval_interval=$EVAL_INTERVAL \
         optimizer.lr=$LR \
         optimizer.min_lr=$MIN_LR \
         scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
@@ -71,10 +73,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct
 MODEL_NAME=qwen3_vl_30b_a3b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
+MICRO_BATCH_SIZE=2
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -82,14 +85,14 @@ LOG_INTERVAL=1
 WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
 
 # EP/TP/PP combinations: "EP,TP,PP" configurations
-PARALLELISM_CONFIGS=("8,1,1" "4,1,1" "2,1,1")
+PARALLELISM_CONFIGS=("8,1,1" "4,1,1")
 
 for config in "${PARALLELISM_CONFIGS[@]}"; do
     IFS=',' read -r EP TP PP <<< "$config"
 
     echo "Running LoRA finetuning with EP=$EP, TP=$TP, PP=$PP"
     uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
+        --recipe ${MODEL_NAME}_peft_config \
         --step_func qwen3_vl_step \
         --peft_scheme lora \
         checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
@@ -97,7 +100,8 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         train.train_iters=$TRAIN_ITERS \
         train.global_batch_size=$GLOBAL_BATCH_SIZE \
         train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
+        validation.eval_iters=$EVAL_ITERS \
+        validation.eval_interval=$EVAL_INTERVAL \
         optimizer.lr=$LR \
         optimizer.min_lr=$MIN_LR \
         scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
diff --git a/examples/models/vlm/qwen3_vl/sft.sh b/examples/models/vlm/qwen3_vl/sft.sh
index 73ea044466..145be3d458 100755
--- a/examples/models/vlm/qwen3_vl/sft.sh
+++ b/examples/models/vlm/qwen3_vl/sft.sh
@@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
 MODEL_NAME=qwen3_vl_8b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
 MICRO_BATCH_SIZE=2
-EVAL_ITERS=10
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -38,28 +39,29 @@ WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
 SEQ_PACKING_CONFIGS=(True False)
 
 # EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations
-PARALLELISM_CONFIGS=("1,1,1,1" "1,1,1,2" "1,1,1,4")
+PARALLELISM_CONFIGS=("1,2,1,1" "1,2,1,2" "1,2,1,4")
 
 for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
     for par_config in "${PARALLELISM_CONFIGS[@]}"; do
         IFS=',' read -r EP TP PP CP <<< "$par_config"
         echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP"
         uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-            --recipe ${MODEL_NAME}_finetune_config \
+            --recipe ${MODEL_NAME}_sft_config \
             --step_func qwen3_vl_step \
             checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
             model.seq_length=$SEQ_LENGTH \
             train.train_iters=$TRAIN_ITERS \
             train.global_batch_size=$GLOBAL_BATCH_SIZE \
             train.micro_batch_size=$MICRO_BATCH_SIZE \
-            train.eval_iters=$EVAL_ITERS \
+            validation.eval_iters=$EVAL_ITERS \
+            validation.eval_interval=$EVAL_INTERVAL \
             optimizer.lr=$LR \
             optimizer.min_lr=$MIN_LR \
             scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_cp${CP} \
+            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_tp${TP}_cp${CP} \
             logger.log_interval=$LOG_INTERVAL \
             logger.wandb_project=$WANDB_PROJECT \
-            logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_cp${CP} \
+            logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_tp${TP}_cp${CP} \
             dataset.maker_name=make_${DATASET_NAME}_dataset \
             dataset.seq_length=$SEQ_LENGTH \
             dataset.pack_sequences_in_batch=$pack_config \
@@ -73,56 +75,3 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
     done
 done
 
-
-# Test Seq Packing configurations for full finetuning on the MoE model
-PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct
-MODEL_NAME=qwen3_vl_30b_a3b
-DATASET_NAME=cord_v2
-SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=2
-EVAL_ITERS=10
-LR=0.00005
-MIN_LR=0.000005
-LR_WARMUP_ITERS=10
-LOG_INTERVAL=1
-WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
-
-SEQ_PACKING_CONFIGS=(True False)
-
-# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations
-PARALLELISM_CONFIGS=("8,1,1,1" "4,1,1,2" "2,1,1,4")
-
-for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
-    for par_config in "${PARALLELISM_CONFIGS[@]}"; do
-        IFS=',' read -r EP TP PP CP <<< "$par_config"
-        echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP"
-        uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-            --recipe ${MODEL_NAME}_finetune_config \
-            --step_func qwen3_vl_step \
-            checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
-            model.seq_length=$SEQ_LENGTH \
-            train.train_iters=$TRAIN_ITERS \
-            train.global_batch_size=$GLOBAL_BATCH_SIZE \
-            train.micro_batch_size=$MICRO_BATCH_SIZE \
-            train.eval_iters=$EVAL_ITERS \
-            optimizer.lr=$LR \
-            optimizer.min_lr=$MIN_LR \
-            scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \
-            logger.log_interval=$LOG_INTERVAL \
-            logger.wandb_project=$WANDB_PROJECT \
-            logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \
-            dataset.maker_name=make_${DATASET_NAME}_dataset \
-            dataset.seq_length=$SEQ_LENGTH \
-            dataset.pack_sequences_in_batch=$pack_config \
-            model.expert_model_parallel_size=$EP \
-            model.tensor_model_parallel_size=$TP \
-            model.pipeline_model_parallel_size=$PP \
-            model.context_parallel_size=$CP \
-            model.calculate_per_token_loss=True \
-            ddp.average_in_collective=False \
-            ddp.grad_reduce_in_fp32=True
-    done
-done
diff --git a/examples/models/vlm/qwen3_vl/sft_unpacked.sh b/examples/models/vlm/qwen3_vl/sft_unpacked.sh
index 510e365d86..b3720dbec5 100755
--- a/examples/models/vlm/qwen3_vl/sft_unpacked.sh
+++ b/examples/models/vlm/qwen3_vl/sft_unpacked.sh
@@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
 MODEL_NAME=qwen3_vl_8b
 DATASET_NAME=cord_v2
 SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
+TRAIN_ITERS=100
+GLOBAL_BATCH_SIZE=16
+MICRO_BATCH_SIZE=2
+EVAL_ITERS=20
+EVAL_INTERVAL=20
 LR=0.00005
 MIN_LR=0.000005
 LR_WARMUP_ITERS=10
@@ -36,21 +37,22 @@ LOG_INTERVAL=1
 WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
 
 # TP/PP combinations: "TP,PP"
-PARALLELISM_CONFIGS=("2,1" "1,2")
+PARALLELISM_CONFIGS=("4,1" "2,1")
 
 for config in "${PARALLELISM_CONFIGS[@]}"; do
     IFS=',' read -r TP PP <<< "$config"
     
     echo "Running full finetuning with TP=$TP, PP=$PP"
-    uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
+    uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+        --recipe ${MODEL_NAME}_sft_config \
         --step_func qwen3_vl_step \
         checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
         model.seq_length=$SEQ_LENGTH \
         train.train_iters=$TRAIN_ITERS \
         train.global_batch_size=$GLOBAL_BATCH_SIZE \
         train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
+        validation.eval_iters=$EVAL_ITERS \
+        validation.eval_interval=$EVAL_INTERVAL \
         optimizer.lr=$LR \
         optimizer.min_lr=$MIN_LR \
         scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
@@ -64,49 +66,3 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do
         model.pipeline_model_parallel_size=$PP
 done
 
-
-# Common configurations for MoE model finetuning
-PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct
-MODEL_NAME=qwen3_vl_30b_a3b
-DATASET_NAME=cord_v2
-SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR=0.00005
-MIN_LR=0.000005
-LR_WARMUP_ITERS=10
-LOG_INTERVAL=1
-WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
-
-# EP/TP/PP/SP combinations: "EP,TP,PP,SP" configurations
-PARALLELISM_CONFIGS=("8,1,1,False" "1,4,2,False" "2,2,2,True")
-
-for config in "${PARALLELISM_CONFIGS[@]}"; do
-    IFS=',' read -r EP TP PP SP <<< "$config"
-
-    echo "Running full finetuning with EP=$EP, TP=$TP, PP=$PP, SP=$SP"
-    uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
-        --recipe ${MODEL_NAME}_finetune_config \
-        --step_func qwen3_vl_step \
-        checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
-        model.seq_length=$SEQ_LENGTH \
-        train.train_iters=$TRAIN_ITERS \
-        train.global_batch_size=$GLOBAL_BATCH_SIZE \
-        train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
-        optimizer.lr=$LR \
-        optimizer.min_lr=$MIN_LR \
-        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_ep${EP}_tp${TP}_pp${PP}_sp_${SP} \
-        logger.log_interval=$LOG_INTERVAL \
-        logger.wandb_project=$WANDB_PROJECT \
-        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_ep${EP}_tp${TP}_pp${PP}_sp_${SP} \
-        dataset.maker_name=make_${DATASET_NAME}_dataset \
-        dataset.seq_length=$SEQ_LENGTH \
-        model.expert_model_parallel_size=$EP \
-        model.tensor_model_parallel_size=$TP \
-        model.pipeline_model_parallel_size=$PP \
-        model.sequence_parallel=$SP
-done
diff --git a/scripts/training/run_recipe.py b/scripts/training/run_recipe.py
index e927143f35..3c52aad5f4 100755
--- a/scripts/training/run_recipe.py
+++ b/scripts/training/run_recipe.py
@@ -133,12 +133,6 @@ def parse_args() -> tuple[argparse.Namespace, list[str]]:
         default=None,
         help="Sequence length for training",
     )
-    parser.add_argument(
-        "--dataset_type",
-        type=str,
-        default=None,
-        help="Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded').",
-    )
     parser.add_argument(
         "--hf_path",
         type=str,
@@ -155,7 +149,6 @@ def load_recipe(
     peft_scheme: str | None,
     packed_sequence: bool = False,
     seq_length: int | None = None,
-    dataset_type: str | None = None,
     hf_path: str | None = None,
 ) -> ConfigContainer:
     """
@@ -166,7 +159,6 @@ def load_recipe(
         peft_scheme: PEFT scheme to use ('lora', 'dora', or None)
         packed_sequence: Enable packed sequence training (default: False)
         seq_length: Sequence length for training (optional)
-        dataset_type: Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded')
         hf_path: HuggingFace model ID or local path to model directory (optional)
 
     Returns:
@@ -193,14 +185,12 @@ def load_recipe(
         accepts_peft = "peft" in params or has_var_keyword
         accepts_packed_sequence = "packed_sequence" in params or has_var_keyword
         accepts_seq_length = "seq_length" in params or has_var_keyword
-        accepts_dataset_type = "dataset_type" in params or has_var_keyword
         accepts_hf_path = "hf_path" in params or has_var_keyword
     except (ValueError, TypeError):
         # If signature inspection fails, fallback conservatively
         accepts_peft = True  # peft is widely supported, try passing it
         accepts_packed_sequence = False  # new parameter, don't pass if unsure
         accepts_seq_length = False  # new parameter, don't pass if unsure
-        accepts_dataset_type = False  # VLM-specific, don't pass if unsure
         accepts_hf_path = False  # model-specific, don't pass if unsure
 
     # Build kwargs dynamically based on what the recipe accepts
@@ -211,8 +201,6 @@ def load_recipe(
         kwargs["packed_sequence"] = packed_sequence
     if accepts_seq_length and seq_length is not None:
         kwargs["seq_length"] = seq_length
-    if accepts_dataset_type and dataset_type is not None:
-        kwargs["dataset_type"] = dataset_type
     if accepts_hf_path and hf_path is not None:
         kwargs["hf_path"] = hf_path
 
@@ -250,7 +238,6 @@ def main() -> None:
         args.peft_scheme,
         args.packed_sequence,
         args.seq_length,
-        args.dataset_type,
         args.hf_path,
     )
 
diff --git a/src/megatron/bridge/data/energon/energon_provider.py b/src/megatron/bridge/data/energon/energon_provider.py
index ff7cbdd22b..f33ea48dc1 100644
--- a/src/megatron/bridge/data/energon/energon_provider.py
+++ b/src/megatron/bridge/data/energon/energon_provider.py
@@ -37,6 +37,7 @@ class EnergonProvider(DatasetProvider):
     pack_sequences_in_batch: bool = False
 
     def build_datasets(self, context: DatasetBuildContext):
+        assert self.path, "EnergonProvider.path must be set. Use CLI override: dataset.path=<path>"
         dataset = EnergonMultiModalDataModule(
             path=self.path,
             tokenizer=context.tokenizer if context.tokenizer is not None else self.tokenizer,
diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py
index 35ef162de1..cf4f94c0d0 100644
--- a/src/megatron/bridge/recipes/qwen_vl/__init__.py
+++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py
@@ -16,6 +16,7 @@
 # Qwen3-VL models
 from .qwen3_vl import (
     qwen3_vl_8b_peft_config,
+    qwen3_vl_8b_peft_energon_config,
     qwen3_vl_8b_sft_config,
     qwen3_vl_30b_a3b_peft_config,
     qwen3_vl_30b_a3b_sft_config,
@@ -91,6 +92,7 @@
     "qwen3_vl_235b_a22b_sft_config",
     # Qwen3-VL PEFT configs
     "qwen3_vl_8b_peft_config",
+    "qwen3_vl_8b_peft_energon_config",
     "qwen3_vl_30b_a3b_peft_config",
     "qwen3_vl_235b_a22b_peft_config",
 ]
diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
index d6b6acda6d..4e26908322 100644
--- a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
+++ b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
@@ -18,16 +18,42 @@
 """
 
 import torch
+from transformers import AutoTokenizer, Qwen3VLProcessor
 
 from megatron.bridge import AutoBridge
+from megatron.bridge.data.energon.energon_provider import EnergonProvider
 from megatron.bridge.peft.base import PEFT
 from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm
+from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.config import ConfigContainer
 from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
 
 
+def _make_energon_dataset(
+    hf_path: str, seq_length: int, micro_batch_size: int, global_batch_size: int
+) -> EnergonProvider:
+    """Build the EnergonProvider for Qwen3-VL recipes; ``path`` is left empty and must be overridden."""
+    tokenizer = AutoTokenizer.from_pretrained(hf_path)
+    # Use Qwen3VLProcessor to match the HF flow (which uses AutoProcessor).
+    # This processor accepts both images and videos kwargs.
+    image_processor = Qwen3VLProcessor.from_pretrained(hf_path)
+    task_encoder = QwenVLTaskEncoder(
+        tokenizer=tokenizer,
+        image_processor=image_processor,
+        max_padding_length=seq_length,
+    )
+    return EnergonProvider(
+        path="",  # Placeholder only; must be set via CLI override: dataset.path=<path>
+        seq_length=seq_length,
+        micro_batch_size=micro_batch_size,
+        global_batch_size=global_batch_size,
+        num_workers=2,
+        task_encoder=task_encoder,
+    )
+
+
 # =============================================================================
 # Qwen3-VL 8B SFT Configuration
 # =============================================================================
@@ -42,7 +68,7 @@ def qwen3_vl_8b_sft_config() -> ConfigContainer:
     cfg = _sft_common_vlm()
 
     # Model configuration
-    hf_path = "Qwen/Qwen3-VL-8B"
+    hf_path = "Qwen/Qwen3-VL-8B-Instruct"
     cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
     cfg.model.seq_length = 4096
 
@@ -177,7 +203,7 @@ def qwen3_vl_30b_a3b_sft_config() -> ConfigContainer:
     cfg = _sft_common_vlm()
 
     # Model configuration
-    hf_path = "Qwen/Qwen3-VL-30B-A3B"
+    hf_path = "Qwen/Qwen3-VL-30B-A3B-Instruct"
     cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
     cfg.model.seq_length = 4096
 
@@ -196,7 +222,7 @@ def qwen3_vl_30b_a3b_sft_config() -> ConfigContainer:
     cfg.model.freeze_vision_projection = False
 
     # Token dispatcher settings (MoE)
-    cfg.model.moe_token_dispatcher_type = None
+    cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = None
     cfg.model.moe_hybridep_num_sms = 16
 
@@ -332,7 +358,7 @@ def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer:
     cfg.model.freeze_vision_projection = False
 
     # Token dispatcher settings (MoE)
-    cfg.model.moe_token_dispatcher_type = None
+    cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = None
     cfg.model.moe_hybridep_num_sms = 16
 
@@ -458,7 +484,7 @@ def qwen3_vl_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer
         cfg.peft = peft_scheme
 
     # Model configuration
-    hf_path = "Qwen/Qwen3-VL-8B"
+    hf_path = "Qwen/Qwen3-VL-8B-Instruct"
     cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
     cfg.model.seq_length = 4096
 
@@ -602,7 +628,7 @@ def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCont
         cfg.peft = peft_scheme
 
     # Model configuration
-    hf_path = "Qwen/Qwen3-VL-30B-A3B"
+    hf_path = "Qwen/Qwen3-VL-30B-A3B-Instruct"
     cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
     cfg.model.seq_length = 4096
 
@@ -621,7 +647,7 @@ def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCont
     cfg.model.freeze_vision_projection = False
 
     # Token dispatcher settings (MoE)
-    cfg.model.moe_token_dispatcher_type = None
+    cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = None
     cfg.model.moe_hybridep_num_sms = 16
 
@@ -766,7 +792,7 @@ def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCo
     cfg.model.freeze_vision_projection = False
 
     # Token dispatcher settings (MoE)
-    cfg.model.moe_token_dispatcher_type = None
+    cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = None
     cfg.model.moe_hybridep_num_sms = 16
 
@@ -867,3 +893,18 @@ def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCo
     # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"
 
     return cfg
+
+
+# =============================================================================
+# Qwen3-VL 8B PEFT with Energon Dataset
+# =============================================================================
+def qwen3_vl_8b_peft_energon_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
+    """Build the Qwen3-VL 8B PEFT (LoRA/DoRA) recipe backed by an Energon dataset.
+
+    Starts from qwen3_vl_8b_peft_config, then swaps in an EnergonProvider dataset.
+    Set the dataset path via CLI override: dataset.path=/path/to/energon/dataset
+    """
+    recipe = qwen3_vl_8b_peft_config(peft_scheme=peft_scheme)
+    mbs, gbs = recipe.train.micro_batch_size, recipe.train.global_batch_size
+    recipe.dataset = _make_energon_dataset("Qwen/Qwen3-VL-8B-Instruct", 4096, mbs, gbs)
+    return recipe
diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
index 0bf6698ec6..f1cac58d35 100644
--- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
+++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
@@ -444,3 +444,122 @@ def test_qwen3_vl_8b_is_dense_model(monkeypatch: pytest.MonkeyPatch):
     assert cfg.model.moe_router_fusion is False
     assert cfg.model.moe_permute_fusion is False
     assert cfg.model.moe_grouped_gemm is False
+
+
+# =============================================================================
+# Qwen3-VL 8B PEFT Energon Config Tests
+# =============================================================================
+
+
+def _patch_energon_deps(monkeypatch):
+    """Swap the recipe module's HF entry points for lightweight fakes.
+
+    AutoBridge becomes _FakeAutoBridge; AutoTokenizer and Qwen3VLProcessor become
+    throwaway classes whose static ``from_pretrained`` accepts anything and returns None.
+    """
+
+    def _null_loader_class(class_name):
+        # Class with a single static from_pretrained(*a, **kw) that returns None.
+        return type(
+            class_name,
+            (),
+            {
+                "from_pretrained": staticmethod(lambda *a, **kw: None),
+            },
+        )
+
+    replacements = (
+        ("AutoBridge", _FakeAutoBridge),
+        ("AutoTokenizer", _null_loader_class("FakeAutoTokenizer")),
+        ("Qwen3VLProcessor", _null_loader_class("FakeProcessor")),
+    )
+    for attr_name, fake in replacements:
+        # Patched in order: bridge, tokenizer, processor.
+        monkeypatch.setattr(_qwen3_vl_module, attr_name, fake)
+
+
+def test_qwen3_vl_8b_peft_energon_builds_config(monkeypatch: pytest.MonkeyPatch):
+    """The default energon PEFT recipe passes the basic config checks and sets a PEFT config."""
+    _patch_energon_deps(monkeypatch)
+
+    recipe = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config()
+
+    _assert_basic_config(recipe)
+    assert recipe.peft is not None
+
+
+def test_qwen3_vl_8b_peft_energon_uses_energon_provider(monkeypatch: pytest.MonkeyPatch):
+    """The energon PEFT recipe exposes an EnergonProvider instance as its dataset."""
+    from megatron.bridge.data.energon.energon_provider import EnergonProvider
+
+    _patch_energon_deps(monkeypatch)
+
+    dataset = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config().dataset
+
+    assert isinstance(dataset, EnergonProvider)
+
+
+def test_qwen3_vl_8b_peft_energon_dataset_params(monkeypatch: pytest.MonkeyPatch):
+    """Dataset seq_length is 4096 and its batch sizes mirror the train config."""
+    _patch_energon_deps(monkeypatch)
+
+    recipe = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config()
+    dataset, train = recipe.dataset, recipe.train
+    assert dataset.seq_length == 4096
+    assert dataset.micro_batch_size == train.micro_batch_size
+    assert dataset.global_batch_size == train.global_batch_size
+
+
+@pytest.mark.parametrize("peft_scheme", ["lora", "dora"])
+def test_qwen3_vl_8b_peft_energon_schemes(peft_scheme: str, monkeypatch: pytest.MonkeyPatch):
+    """Both the lora and dora schemes yield a valid config whose PEFT object has dim and alpha."""
+    _patch_energon_deps(monkeypatch)
+
+    recipe = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config(peft_scheme=peft_scheme)
+
+    _assert_basic_config(recipe)
+    adapter = recipe.peft
+    assert adapter is not None
+    assert all(hasattr(adapter, field) for field in ("dim", "alpha"))
+
+
+def test_qwen3_vl_8b_peft_energon_parallelism(monkeypatch: pytest.MonkeyPatch):
+    """Tensor, pipeline and expert model-parallel sizes are all 1 for the energon recipe."""
+    _patch_energon_deps(monkeypatch)
+
+    model = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config().model
+
+    assert model.tensor_model_parallel_size == 1
+    assert model.pipeline_model_parallel_size == 1
+    assert model.expert_model_parallel_size == 1
+
+
+def test_qwen3_vl_8b_peft_energon_precision(monkeypatch: pytest.MonkeyPatch):
+    """The energon recipe's mixed_precision setting is the string ``bf16_mixed``."""
+    _patch_energon_deps(monkeypatch)
+
+    recipe = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config()
+    precision = recipe.mixed_precision
+    assert precision == "bf16_mixed"
+
+
+def test_qwen3_vl_8b_peft_energon_freeze_defaults(monkeypatch: pytest.MonkeyPatch):
+    """Language model, vision model and vision projection freeze flags are all False."""
+    _patch_energon_deps(monkeypatch)
+
+    model = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config().model
+
+    assert model.freeze_language_model is False
+    assert model.freeze_vision_model is False
+    assert model.freeze_vision_projection is False
+
+
+def test_qwen3_vl_8b_peft_energon_task_encoder(monkeypatch: pytest.MonkeyPatch):
+    """The energon dataset's task_encoder is a QwenVLTaskEncoder instance."""
+    from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder
+
+    _patch_energon_deps(monkeypatch)
+
+    dataset = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config().dataset
+
+    assert isinstance(dataset.task_encoder, QwenVLTaskEncoder)