diff --git a/examples/models/vlm/ministral3/README.md b/examples/models/vlm/ministral3/README.md index 578dd8908b..0c1cd0fcba 100644 --- a/examples/models/vlm/ministral3/README.md +++ b/examples/models/vlm/ministral3/README.md @@ -85,9 +85,12 @@ The current date is {today}. - See: [bridge.recipes.ministral3](../../apidocs/bridge/bridge.recipes.ministral3.md) - Available recipes: - - `ministral3_3b_finetune_config`: Finetuning for 3B VL model with PEFT support - - `ministral3_8b_finetune_config`: Finetuning for 8B VL model with PEFT support - - `ministral3_14b_finetune_config`: Finetuning for 14B VL model with PEFT support + - `ministral3_3b_sft_config`: Finetuning for 3B VL model + - `ministral3_8b_sft_config`: Finetuning for 8B VL model + - `ministral3_14b_sft_config`: Finetuning for 14B VL model + - `ministral3_3b_peft_config`: Finetuning for 3B VL model with PEFT support + - `ministral3_8b_peft_config`: Finetuning for 8B VL model with PEFT support + - `ministral3_14b_peft_config`: Finetuning for 14B VL model with PEFT support Before training, ensure the following environment variables are set: 1. `SAVE_DIR`: checkpoint and log saving directory @@ -101,15 +104,11 @@ Pretraining is not verified for this model. ### Supervised Fine-Tuning (SFT) -See the [sft.sh](sft.sh) script for full parameter fine-tuning with configurable model parallelisms. - -W&B report coming soon. +See the [sft_unpacked.sh](sft_unpacked.sh) script for full parameter fine-tuning with configurable model parallelisms. ### Parameter-Efficient Fine-Tuning (PEFT) with LoRA -See the [peft.sh](peft.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism. - -W&B report coming soon. +See the [peft_unpacked.sh](peft_unpacked.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism. ### Recommended Configurations @@ -124,6 +123,9 @@ W&B report coming soon. 
**Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs. +### Expected Training Dynamics +We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/h32cflfn) for the expected loss curves and grad norms. + ## Evaluation Coming soon. diff --git a/examples/models/vlm/ministral3/peft.sh b/examples/models/vlm/ministral3/peft_unpacked.sh similarity index 92% rename from examples/models/vlm/ministral3/peft.sh rename to examples/models/vlm/ministral3/peft_unpacked.sh index b3c44a2f86..2f7be5b8a4 100755 --- a/examples/models/vlm/ministral3/peft.sh +++ b/examples/models/vlm/ministral3/peft_unpacked.sh @@ -29,12 +29,12 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 MODEL_NAME=ministral3_3b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR=0.0002 -MIN_LR=0.00002 +EVAL_ITERS=20 +LR=0.00005 +MIN_LR=0.000005 LR_WARMUP_ITERS=10 LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} @@ -47,7 +47,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running LoRA finetuning with TP=$TP, PP=$PP" uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_peft_config \ --step_func vlm_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ @@ -55,7 +55,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ @@ -65,6 +65,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP} \ 
dataset.maker_name=make_${DATASET_NAME}_dataset \ dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=False \ model.tensor_model_parallel_size=$TP \ model.pipeline_model_parallel_size=$PP done diff --git a/examples/models/vlm/ministral3/sft.sh b/examples/models/vlm/ministral3/sft_unpacked.sh similarity index 92% rename from examples/models/vlm/ministral3/sft.sh rename to examples/models/vlm/ministral3/sft_unpacked.sh index a22eebbb03..431f7963e0 100755 --- a/examples/models/vlm/ministral3/sft.sh +++ b/examples/models/vlm/ministral3/sft_unpacked.sh @@ -29,12 +29,12 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 MODEL_NAME=ministral3_3b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR=0.00005 -MIN_LR=0.000005 +EVAL_ITERS=20 +LR=0.00001 +MIN_LR=0.000001 LR_WARMUP_ITERS=10 LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} @@ -47,14 +47,14 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running full finetuning with TP=$TP, PP=$PP" uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_sft_config \ --step_func vlm_step \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ @@ -64,6 +64,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_tp${TP}_pp${PP} \ dataset.maker_name=make_${DATASET_NAME}_dataset \ dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=False \ model.tensor_model_parallel_size=$TP \ 
model.pipeline_model_parallel_size=$PP done diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md index da9da325a1..c0fb5fc9b7 100644 --- a/examples/models/vlm/qwen3_vl/README.md +++ b/examples/models/vlm/qwen3_vl/README.md @@ -106,15 +106,11 @@ Before training, ensure the following environment variables are set: See the [sft_unpacked.sh](sft_unpacked.sh) script for full parameter fine-tuning with configurable model parallelisms, with unpacked sequences. See the [sft.sh](sft.sh) script for full parameter fine-tuning with sequence-packing. -W&B report coming soon. - ### Parameter-Efficient Fine-Tuning (PEFT) with LoRA See the [peft_unpacked.sh](peft_unpacked.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism, with unpacked sequences. See the [peft.sh](peft.sh) script for LoRA fine-tuning with sequence-packing. -W&B report coming soon. - **Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs. ## Finetuning with Energon Dataset @@ -129,7 +125,11 @@ field_map: conversation: json ``` -Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [sft_energon.sh](sft_energon.sh) and run the script. +Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [peft_energon.sh](peft_energon.sh) and run the script. + + +### Expected Training Dynamics +We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/lczz4ixx) for the expected loss curves and grad norms. 
## Evaluation diff --git a/examples/models/vlm/qwen3_vl/peft.sh b/examples/models/vlm/qwen3_vl/peft.sh index de116903e1..80ce6226cd 100755 --- a/examples/models/vlm/qwen3_vl/peft.sh +++ b/examples/models/vlm/qwen3_vl/peft.sh @@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct MODEL_NAME=qwen3_vl_8b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=2 -EVAL_ITERS=10 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -45,7 +46,7 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do IFS=',' read -r EP TP PP CP <<< "$par_config" echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_peft_config \ --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ @@ -53,7 +54,8 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ @@ -80,10 +82,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct MODEL_NAME=qwen3_vl_30b_a3b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=2 -EVAL_ITERS=10 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -100,7 +103,7 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do IFS=',' read -r EP TP PP CP <<< "$par_config" echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP 
PP=$PP CP=$CP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_peft_config \ --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ @@ -108,7 +111,8 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ diff --git a/examples/models/vlm/qwen3_vl/sft_energon.sh b/examples/models/vlm/qwen3_vl/peft_energon.sh similarity index 90% rename from examples/models/vlm/qwen3_vl/sft_energon.sh rename to examples/models/vlm/qwen3_vl/peft_energon.sh index 10188ac86d..dd43fb0242 100755 --- a/examples/models/vlm/qwen3_vl/sft_energon.sh +++ b/examples/models/vlm/qwen3_vl/peft_energon.sh @@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct MODEL_NAME=qwen3_vl_8b DATASET_NAME=energon SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=2 -EVAL_ITERS=10 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -47,20 +48,20 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do IFS=',' read -r EP TP PP CP N_PROC <<< "$par_config" echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP N_PROC=$N_PROC" uv run python -m torch.distributed.run --nproc_per_node=$N_PROC scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_peft_energon_config \ --step_func qwen3_vl_step \ --peft_scheme lora \ - --dataset_type energon \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ 
train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_seq_pack_${pack_config}_cp${CP} \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_energon_lora_seq_pack_${pack_config}_cp${CP} \ logger.log_interval=$LOG_INTERVAL \ logger.wandb_project=$WANDB_PROJECT \ logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_seq_pack_${pack_config}_cp${CP} \ diff --git a/examples/models/vlm/qwen3_vl/peft_unpacked.sh b/examples/models/vlm/qwen3_vl/peft_unpacked.sh index 06c4518935..90419918fa 100755 --- a/examples/models/vlm/qwen3_vl/peft_unpacked.sh +++ b/examples/models/vlm/qwen3_vl/peft_unpacked.sh @@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct MODEL_NAME=qwen3_vl_8b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -36,14 +37,14 @@ LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} # TP/PP combinations: "TP,PP" -PARALLELISM_CONFIGS=("2,1" "1,2") +PARALLELISM_CONFIGS=("4,1" "2,1") for config in "${PARALLELISM_CONFIGS[@]}"; do IFS=',' read -r TP PP <<< "$config" echo "Running LoRA finetuning with TP=$TP, PP=$PP" - uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_peft_config \ --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ @@ -51,7 +52,8 @@ for 
config in "${PARALLELISM_CONFIGS[@]}"; do train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ @@ -71,10 +73,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct MODEL_NAME=qwen3_vl_30b_a3b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -82,14 +85,14 @@ LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} # EP/TP/PP combinations: "EP,TP,PP" configurations -PARALLELISM_CONFIGS=("8,1,1" "4,1,1" "2,1,1") +PARALLELISM_CONFIGS=("8,1,1" "4,1,1") for config in "${PARALLELISM_CONFIGS[@]}"; do IFS=',' read -r EP TP PP <<< "$config" echo "Running LoRA finetuning with EP=$EP, TP=$TP, PP=$PP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_peft_config \ --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ @@ -97,7 +100,8 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ diff --git a/examples/models/vlm/qwen3_vl/sft.sh b/examples/models/vlm/qwen3_vl/sft.sh index 73ea044466..145be3d458 100755 --- a/examples/models/vlm/qwen3_vl/sft.sh +++ b/examples/models/vlm/qwen3_vl/sft.sh @@ -25,10 +25,11 @@ 
PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct MODEL_NAME=qwen3_vl_8b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 MICRO_BATCH_SIZE=2 -EVAL_ITERS=10 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -38,28 +39,29 @@ WANDB_PROJECT=megatron-bridge-${DATASET_NAME} SEQ_PACKING_CONFIGS=(True False) # EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations -PARALLELISM_CONFIGS=("1,1,1,1" "1,1,1,2" "1,1,1,4") +PARALLELISM_CONFIGS=("1,2,1,1" "1,2,1,2" "1,2,1,4") for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do for par_config in "${PARALLELISM_CONFIGS[@]}"; do IFS=',' read -r EP TP PP CP <<< "$par_config" echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + --recipe ${MODEL_NAME}_sft_config \ --step_func qwen3_vl_step \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_cp${CP} \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_tp${TP}_cp${CP} \ logger.log_interval=$LOG_INTERVAL \ logger.wandb_project=$WANDB_PROJECT \ - logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_cp${CP} \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_tp${TP}_cp${CP} \ dataset.maker_name=make_${DATASET_NAME}_dataset \ dataset.seq_length=$SEQ_LENGTH \ 
dataset.pack_sequences_in_batch=$pack_config \ @@ -73,56 +75,3 @@ for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do done done - -# Test Seq Packing configurations for full finetuning on the MoE model -PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct -MODEL_NAME=qwen3_vl_30b_a3b -DATASET_NAME=cord_v2 -SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=2 -EVAL_ITERS=10 -LR=0.00005 -MIN_LR=0.000005 -LR_WARMUP_ITERS=10 -LOG_INTERVAL=1 -WANDB_PROJECT=megatron-bridge-${DATASET_NAME} - -SEQ_PACKING_CONFIGS=(True False) - -# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations -PARALLELISM_CONFIGS=("8,1,1,1" "4,1,1,2" "2,1,1,4") - -for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do - for par_config in "${PARALLELISM_CONFIGS[@]}"; do - IFS=',' read -r EP TP PP CP <<< "$par_config" - echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" - uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ - --step_func qwen3_vl_step \ - checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ - model.seq_length=$SEQ_LENGTH \ - train.train_iters=$TRAIN_ITERS \ - train.global_batch_size=$GLOBAL_BATCH_SIZE \ - train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ - optimizer.lr=$LR \ - optimizer.min_lr=$MIN_LR \ - scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \ - logger.log_interval=$LOG_INTERVAL \ - logger.wandb_project=$WANDB_PROJECT \ - logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \ - dataset.maker_name=make_${DATASET_NAME}_dataset \ - dataset.seq_length=$SEQ_LENGTH \ - dataset.pack_sequences_in_batch=$pack_config \ - model.expert_model_parallel_size=$EP \ - model.tensor_model_parallel_size=$TP \ - model.pipeline_model_parallel_size=$PP \ - 
model.context_parallel_size=$CP \ - model.calculate_per_token_loss=True \ - ddp.average_in_collective=False \ - ddp.grad_reduce_in_fp32=True - done -done diff --git a/examples/models/vlm/qwen3_vl/sft_unpacked.sh b/examples/models/vlm/qwen3_vl/sft_unpacked.sh index 510e365d86..b3720dbec5 100755 --- a/examples/models/vlm/qwen3_vl/sft_unpacked.sh +++ b/examples/models/vlm/qwen3_vl/sft_unpacked.sh @@ -25,10 +25,11 @@ PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct MODEL_NAME=qwen3_vl_8b DATASET_NAME=cord_v2 SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 +TRAIN_ITERS=100 +GLOBAL_BATCH_SIZE=16 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=20 +EVAL_INTERVAL=20 LR=0.00005 MIN_LR=0.000005 LR_WARMUP_ITERS=10 @@ -36,21 +37,22 @@ LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} # TP/PP combinations: "TP,PP" -PARALLELISM_CONFIGS=("2,1" "1,2") +PARALLELISM_CONFIGS=("4,1" "2,1") for config in "${PARALLELISM_CONFIGS[@]}"; do IFS=',' read -r TP PP <<< "$config" echo "Running full finetuning with TP=$TP, PP=$PP" - uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_sft_config \ --step_func qwen3_vl_step \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ + validation.eval_iters=$EVAL_ITERS \ + validation.eval_interval=$EVAL_INTERVAL \ optimizer.lr=$LR \ optimizer.min_lr=$MIN_LR \ scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ @@ -64,49 +66,3 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do model.pipeline_model_parallel_size=$PP done - -# Common configurations for MoE model finetuning 
-PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct -MODEL_NAME=qwen3_vl_30b_a3b -DATASET_NAME=cord_v2 -SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR=0.00005 -MIN_LR=0.000005 -LR_WARMUP_ITERS=10 -LOG_INTERVAL=1 -WANDB_PROJECT=megatron-bridge-${DATASET_NAME} - -# EP/TP/PP/SP combinations: "EP,TP,PP,SP" configurations -PARALLELISM_CONFIGS=("8,1,1,False" "1,4,2,False" "2,2,2,True") - -for config in "${PARALLELISM_CONFIGS[@]}"; do - IFS=',' read -r EP TP PP SP <<< "$config" - - echo "Running full finetuning with EP=$EP, TP=$TP, PP=$PP, SP=$SP" - uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ - --recipe ${MODEL_NAME}_finetune_config \ - --step_func qwen3_vl_step \ - checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ - model.seq_length=$SEQ_LENGTH \ - train.train_iters=$TRAIN_ITERS \ - train.global_batch_size=$GLOBAL_BATCH_SIZE \ - train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ - optimizer.lr=$LR \ - optimizer.min_lr=$MIN_LR \ - scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_ep${EP}_tp${TP}_pp${PP}_sp_${SP} \ - logger.log_interval=$LOG_INTERVAL \ - logger.wandb_project=$WANDB_PROJECT \ - logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_ep${EP}_tp${TP}_pp${PP}_sp_${SP} \ - dataset.maker_name=make_${DATASET_NAME}_dataset \ - dataset.seq_length=$SEQ_LENGTH \ - model.expert_model_parallel_size=$EP \ - model.tensor_model_parallel_size=$TP \ - model.pipeline_model_parallel_size=$PP \ - model.sequence_parallel=$SP -done diff --git a/scripts/training/run_recipe.py b/scripts/training/run_recipe.py index e927143f35..3c52aad5f4 100755 --- a/scripts/training/run_recipe.py +++ b/scripts/training/run_recipe.py @@ -133,12 +133,6 @@ def parse_args() -> tuple[argparse.Namespace, list[str]]: default=None, help="Sequence length for training", ) - parser.add_argument( - "--dataset_type", 
- type=str, - default=None, - help="Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded').", - ) parser.add_argument( "--hf_path", type=str, @@ -155,7 +149,6 @@ def load_recipe( peft_scheme: str | None, packed_sequence: bool = False, seq_length: int | None = None, - dataset_type: str | None = None, hf_path: str | None = None, ) -> ConfigContainer: """ @@ -166,7 +159,6 @@ def load_recipe( peft_scheme: PEFT scheme to use ('lora', 'dora', or None) packed_sequence: Enable packed sequence training (default: False) seq_length: Sequence length for training (optional) - dataset_type: Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded') hf_path: HuggingFace model ID or local path to model directory (optional) Returns: @@ -193,14 +185,12 @@ def load_recipe( accepts_peft = "peft" in params or has_var_keyword accepts_packed_sequence = "packed_sequence" in params or has_var_keyword accepts_seq_length = "seq_length" in params or has_var_keyword - accepts_dataset_type = "dataset_type" in params or has_var_keyword accepts_hf_path = "hf_path" in params or has_var_keyword except (ValueError, TypeError): # If signature inspection fails, fallback conservatively accepts_peft = True # peft is widely supported, try passing it accepts_packed_sequence = False # new parameter, don't pass if unsure accepts_seq_length = False # new parameter, don't pass if unsure - accepts_dataset_type = False # VLM-specific, don't pass if unsure accepts_hf_path = False # model-specific, don't pass if unsure # Build kwargs dynamically based on what the recipe accepts @@ -211,8 +201,6 @@ def load_recipe( kwargs["packed_sequence"] = packed_sequence if accepts_seq_length and seq_length is not None: kwargs["seq_length"] = seq_length - if accepts_dataset_type and dataset_type is not None: - kwargs["dataset_type"] = dataset_type if accepts_hf_path and hf_path is not None: kwargs["hf_path"] = hf_path @@ -250,7 +238,6 @@ def main() -> None: args.peft_scheme, 
args.packed_sequence, args.seq_length, - args.dataset_type, args.hf_path, ) diff --git a/src/megatron/bridge/data/energon/energon_provider.py b/src/megatron/bridge/data/energon/energon_provider.py index ff7cbdd22b..f33ea48dc1 100644 --- a/src/megatron/bridge/data/energon/energon_provider.py +++ b/src/megatron/bridge/data/energon/energon_provider.py @@ -37,6 +37,7 @@ class EnergonProvider(DatasetProvider): pack_sequences_in_batch: bool = False def build_datasets(self, context: DatasetBuildContext): + assert self.path, "EnergonProvider.path must be set. Use CLI override: dataset.path=" dataset = EnergonMultiModalDataModule( path=self.path, tokenizer=context.tokenizer if context.tokenizer is not None else self.tokenizer, diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py index 35ef162de1..cf4f94c0d0 100644 --- a/src/megatron/bridge/recipes/qwen_vl/__init__.py +++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py @@ -16,6 +16,7 @@ # Qwen3-VL models from .qwen3_vl import ( qwen3_vl_8b_peft_config, + qwen3_vl_8b_peft_energon_config, qwen3_vl_8b_sft_config, qwen3_vl_30b_a3b_peft_config, qwen3_vl_30b_a3b_sft_config, @@ -91,6 +92,7 @@ "qwen3_vl_235b_a22b_sft_config", # Qwen3-VL PEFT configs "qwen3_vl_8b_peft_config", + "qwen3_vl_8b_peft_energon_config", "qwen3_vl_30b_a3b_peft_config", "qwen3_vl_235b_a22b_peft_config", ] diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py index d6b6acda6d..4e26908322 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py @@ -18,16 +18,42 @@ """ import torch +from transformers import AutoTokenizer, Qwen3VLProcessor from megatron.bridge import AutoBridge +from megatron.bridge.data.energon.energon_provider import EnergonProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm +from 
def _make_energon_dataset(
    hf_path: str, seq_length: int, micro_batch_size: int, global_batch_size: int
) -> EnergonProvider:
    """Build the ``EnergonProvider`` dataset config used by Qwen3-VL recipes.

    Args:
        hf_path: HuggingFace model ID or local path from which the tokenizer
            and the Qwen3-VL processor are loaded.
        seq_length: Sequence length given to the provider; it is also used as
            the task encoder's ``max_padding_length``.
        micro_batch_size: Micro batch size given to the provider.
        global_batch_size: Global batch size given to the provider.

    Returns:
        An ``EnergonProvider`` whose ``path`` is left empty. The caller is
        expected to supply it through the CLI override ``dataset.path=``.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(hf_path)
    # Qwen3VLProcessor is used to match the HF flow (which goes through
    # AutoProcessor); this processor accepts both images and videos kwargs.
    hf_processor = Qwen3VLProcessor.from_pretrained(hf_path)

    return EnergonProvider(
        path="",  # Must be set via CLI override: dataset.path=
        seq_length=seq_length,
        micro_batch_size=micro_batch_size,
        global_batch_size=global_batch_size,
        num_workers=2,
        task_encoder=QwenVLTaskEncoder(
            tokenizer=hf_tokenizer,
            image_processor=hf_processor,
            max_padding_length=seq_length,
        ),
    )
AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -196,7 +222,7 @@ def qwen3_vl_30b_a3b_sft_config() -> ConfigContainer: cfg.model.freeze_vision_projection = False # Token dispatcher settings (MoE) - cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_token_dispatcher_type = "alltoall" cfg.model.moe_flex_dispatcher_backend = None cfg.model.moe_hybridep_num_sms = 16 @@ -332,7 +358,7 @@ def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer: cfg.model.freeze_vision_projection = False # Token dispatcher settings (MoE) - cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_token_dispatcher_type = "alltoall" cfg.model.moe_flex_dispatcher_backend = None cfg.model.moe_hybridep_num_sms = 16 @@ -458,7 +484,7 @@ def qwen3_vl_8b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3-VL-8B" + hf_path = "Qwen/Qwen3-VL-8B-Instruct" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -602,7 +628,7 @@ def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCont cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3-VL-30B-A3B" + hf_path = "Qwen/Qwen3-VL-30B-A3B-Instruct" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -621,7 +647,7 @@ def qwen3_vl_30b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCont cfg.model.freeze_vision_projection = False # Token dispatcher settings (MoE) - cfg.model.moe_token_dispatcher_type = None + cfg.model.moe_token_dispatcher_type = "alltoall" cfg.model.moe_flex_dispatcher_backend = None cfg.model.moe_hybridep_num_sms = 16 @@ -766,7 +792,7 @@ def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCo cfg.model.freeze_vision_projection = False # Token dispatcher settings (MoE) - 
def qwen3_vl_8b_peft_energon_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer:
    """Return a PEFT (LoRA/DoRA) config for Qwen3-VL 8B with an Energon dataset.

    Same as ``qwen3_vl_8b_peft_config`` but uses ``EnergonProvider`` instead of
    the HF dataset. The dataset path is not set here; supply it via the CLI
    override ``dataset.path=/path/to/energon/dataset``.

    Args:
        peft_scheme: PEFT scheme (name or ``PEFT`` instance), forwarded
            unchanged to ``qwen3_vl_8b_peft_config``.

    Returns:
        The base 8B PEFT ``ConfigContainer`` with ``cfg.dataset`` replaced by
        an ``EnergonProvider``.
    """
    cfg = qwen3_vl_8b_peft_config(peft_scheme=peft_scheme)

    # Take the sequence length and batch sizes from the base recipe instead of
    # repeating literals, so the dataset cannot drift from the model/train
    # settings if the base recipe changes.
    cfg.dataset = _make_energon_dataset(
        # Must name the same HF checkpoint as qwen3_vl_8b_peft_config.
        hf_path="Qwen/Qwen3-VL-8B-Instruct",
        seq_length=cfg.model.seq_length,
        micro_batch_size=cfg.train.micro_batch_size,
        global_batch_size=cfg.train.global_batch_size,
    )
    return cfg
monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge) + monkeypatch.setattr( + _qwen3_vl_module, + "AutoTokenizer", + type( + "FakeAutoTokenizer", + (), + { + "from_pretrained": staticmethod(lambda *a, **kw: None), + }, + ), + ) + monkeypatch.setattr( + _qwen3_vl_module, + "Qwen3VLProcessor", + type( + "FakeProcessor", + (), + { + "from_pretrained": staticmethod(lambda *a, **kw: None), + }, + ), + ) + + +def test_qwen3_vl_8b_peft_energon_builds_config(monkeypatch: pytest.MonkeyPatch): + """Test that the energon PEFT config builds a valid ConfigContainer.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + _assert_basic_config(cfg) + assert cfg.peft is not None + + +def test_qwen3_vl_8b_peft_energon_uses_energon_provider(monkeypatch: pytest.MonkeyPatch): + """Test that the energon config uses EnergonProvider as dataset.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + from megatron.bridge.data.energon.energon_provider import EnergonProvider + + assert isinstance(cfg.dataset, EnergonProvider) + + +def test_qwen3_vl_8b_peft_energon_dataset_params(monkeypatch: pytest.MonkeyPatch): + """Test that the energon dataset has correct seq_length, batch sizes.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + assert cfg.dataset.seq_length == 4096 + assert cfg.dataset.micro_batch_size == cfg.train.micro_batch_size + assert cfg.dataset.global_batch_size == cfg.train.global_batch_size + + +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_qwen3_vl_8b_peft_energon_schemes(peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that lora and dora schemes work with energon config.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config(peft_scheme=peft_scheme) + + _assert_basic_config(cfg) + assert cfg.peft is not None + assert hasattr(cfg.peft, 
"dim") + assert hasattr(cfg.peft, "alpha") + + +def test_qwen3_vl_8b_peft_energon_parallelism(monkeypatch: pytest.MonkeyPatch): + """Test that energon config inherits 8B PEFT parallelism (TP=1, PP=1).""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 1 + + +def test_qwen3_vl_8b_peft_energon_precision(monkeypatch: pytest.MonkeyPatch): + """Test that energon config uses bf16_mixed precision.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + assert cfg.mixed_precision == "bf16_mixed" + + +def test_qwen3_vl_8b_peft_energon_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that energon PEFT config has freeze options set to False.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_qwen3_vl_8b_peft_energon_task_encoder(monkeypatch: pytest.MonkeyPatch): + """Test that energon config creates a QwenVLTaskEncoder in the dataset.""" + _patch_energon_deps(monkeypatch) + + cfg = _qwen3_vl_module.qwen3_vl_8b_peft_energon_config() + + from megatron.bridge.recipes.qwen_vl.data.energon.task_encoder import QwenVLTaskEncoder + + assert isinstance(cfg.dataset.task_encoder, QwenVLTaskEncoder)