diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md index f62008f601..470b0d78f8 100644 --- a/examples/models/vlm/qwen3_vl/README.md +++ b/examples/models/vlm/qwen3_vl/README.md @@ -103,13 +103,15 @@ Before training, ensure the following environment variables are set: ### Supervised Fine-Tuning (SFT) -See the [sft.sh](sft.sh) script for full parameter fine-tuning with configurable model parallelisms. +See the [sft_seq_unpacked.sh](sft_seq_unpacked.sh) script for full parameter fine-tuning with configurable model parallelisms, with unpacked sequences. +See the [sft_seq_packed.sh](sft_seq_packed.sh) script for full parameter fine-tuning with sequence-packing. W&B report coming soon. ### Parameter-Efficient Fine-Tuning (PEFT) with LoRA -See the [peft.sh](peft.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism. +See the [peft_seq_unpacked.sh](peft_seq_unpacked.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism, with unpacked sequences. +See the [peft_seq_packed.sh](peft_seq_packed.sh) script for LoRA fine-tuning with sequence-packing. W&B report coming soon. diff --git a/examples/models/vlm/qwen3_vl/peft_seq_packed.sh b/examples/models/vlm/qwen3_vl/peft_seq_packed.sh new file mode 100755 index 0000000000..de116903e1 --- /dev/null +++ b/examples/models/vlm/qwen3_vl/peft_seq_packed.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Before training, make sure to set WANDB_API_KEY or disable wandb logging +# export WANDB_API_KEY= +# export WANDB_MODE=disabled + +# Test Seq Packing configurations for LoRA finetuning on the dense model +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct +MODEL_NAME=qwen3_vl_8b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=10 +LR=0.00005 +MIN_LR=0.000005 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +SEQ_PACKING_CONFIGS=(True False) + +# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations +PARALLELISM_CONFIGS=("1,1,1,1" "1,1,1,2" "1,1,1,4") + +for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do + for par_config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r EP TP PP CP <<< "$par_config" + echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func qwen3_vl_step \ + --peft_scheme lora \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_seq_pack_${pack_config}_cp${CP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_seq_pack_${pack_config}_cp${CP} \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + 
dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=$pack_config \ + model.expert_model_parallel_size=$EP \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP \ + model.context_parallel_size=$CP \ + model.calculate_per_token_loss=True \ + ddp.average_in_collective=False \ + ddp.grad_reduce_in_fp32=True + done +done + + +# Test Seq Packing configurations for LoRA finetuning on the MoE model +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct +MODEL_NAME=qwen3_vl_30b_a3b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=10 +LR=0.00005 +MIN_LR=0.000005 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +SEQ_PACKING_CONFIGS=(True False) + +# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations +PARALLELISM_CONFIGS=("8,1,1,1" "4,1,1,2" "2,1,1,4") + +for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do + for par_config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r EP TP PP CP <<< "$par_config" + echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func qwen3_vl_step \ + --peft_scheme lora \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_seq_pack_${pack_config}_ep${EP}_cp${CP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_seq_pack_${pack_config}_ep${EP}_cp${CP} \ + 
dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=$pack_config \ + model.expert_model_parallel_size=$EP \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP \ + model.context_parallel_size=$CP \ + model.calculate_per_token_loss=True \ + ddp.average_in_collective=False \ + ddp.grad_reduce_in_fp32=True + done +done diff --git a/examples/models/vlm/qwen3_vl/peft.sh b/examples/models/vlm/qwen3_vl/peft_seq_unpacked.sh similarity index 97% rename from examples/models/vlm/qwen3_vl/peft.sh rename to examples/models/vlm/qwen3_vl/peft_seq_unpacked.sh index 6cddb470a0..06c4518935 100644 --- a/examples/models/vlm/qwen3_vl/peft.sh +++ b/examples/models/vlm/qwen3_vl/peft_seq_unpacked.sh @@ -44,7 +44,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running LoRA finetuning with TP=$TP, PP=$PP" uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \ --recipe ${MODEL_NAME}_finetune_config \ - --step_func vlm_step \ + --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ @@ -82,7 +82,7 @@ LOG_INTERVAL=1 WANDB_PROJECT=megatron-bridge-${DATASET_NAME} # EP/TP/PP combinations: "EP,TP,PP" configurations -PARALLELISM_CONFIGS=("8,1,1" "1,4,2") +PARALLELISM_CONFIGS=("8,1,1" "4,1,1" "2,1,1") for config in "${PARALLELISM_CONFIGS[@]}"; do IFS=',' read -r EP TP PP <<< "$config" @@ -90,7 +90,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running LoRA finetuning with EP=$EP, TP=$TP, PP=$PP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ --recipe ${MODEL_NAME}_finetune_config \ - --step_func vlm_step \ + --step_func qwen3_vl_step \ --peft_scheme lora \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ @@ -111,4 +111,3 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do 
model.tensor_model_parallel_size=$TP \ model.pipeline_model_parallel_size=$PP done - diff --git a/examples/models/vlm/qwen3_vl/sft_seq_packed.sh b/examples/models/vlm/qwen3_vl/sft_seq_packed.sh new file mode 100755 index 0000000000..73ea044466 --- /dev/null +++ b/examples/models/vlm/qwen3_vl/sft_seq_packed.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Before training, make sure to set WANDB_API_KEY or disable wandb logging +# export WANDB_API_KEY= +# export WANDB_MODE=disabled + +# Test Seq Packing configurations for full finetuning on the dense model +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct +MODEL_NAME=qwen3_vl_8b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=10 +LR=0.00005 +MIN_LR=0.000005 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +SEQ_PACKING_CONFIGS=(True False) + +# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations +PARALLELISM_CONFIGS=("1,1,1,1" "1,1,1,2" "1,1,1,4") + +for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do + for par_config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r EP TP PP CP <<< "$par_config" + echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" + uv run python 
-m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func qwen3_vl_step \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_cp${CP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_cp${CP} \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=$pack_config \ + model.expert_model_parallel_size=$EP \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP \ + model.context_parallel_size=$CP \ + model.calculate_per_token_loss=True \ + ddp.average_in_collective=False \ + ddp.grad_reduce_in_fp32=True + done +done + + +# Test Seq Packing configurations for full finetuning on the MoE model +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-30B-A3B-Instruct +MODEL_NAME=qwen3_vl_30b_a3b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=2 +EVAL_ITERS=10 +LR=0.00005 +MIN_LR=0.000005 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +SEQ_PACKING_CONFIGS=(True False) + +# EP/TP/PP/CP combinations: "EP,TP,PP,CP" configurations +PARALLELISM_CONFIGS=("8,1,1,1" "4,1,1,2" "2,1,1,4") + +for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do + for par_config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r EP TP PP CP <<< "$par_config" + echo "Running full finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP" + uv run python 
-m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func qwen3_vl_step \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_seq_pack_${pack_config}_ep${EP}_cp${CP} \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH \ + dataset.pack_sequences_in_batch=$pack_config \ + model.expert_model_parallel_size=$EP \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP \ + model.context_parallel_size=$CP \ + model.calculate_per_token_loss=True \ + ddp.average_in_collective=False \ + ddp.grad_reduce_in_fp32=True + done +done diff --git a/examples/models/vlm/qwen3_vl/sft.sh b/examples/models/vlm/qwen3_vl/sft_seq_unpacked.sh similarity index 98% rename from examples/models/vlm/qwen3_vl/sft.sh rename to examples/models/vlm/qwen3_vl/sft_seq_unpacked.sh index 0a26786273..510e365d86 100755 --- a/examples/models/vlm/qwen3_vl/sft.sh +++ b/examples/models/vlm/qwen3_vl/sft_seq_unpacked.sh @@ -44,7 +44,7 @@ for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running full finetuning with TP=$TP, PP=$PP" uv run python -m torch.distributed.run --nproc_per_node=2 scripts/training/run_recipe.py \ --recipe ${MODEL_NAME}_finetune_config \ - --step_func vlm_step \ + --step_func qwen3_vl_step \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ train.train_iters=$TRAIN_ITERS \ @@ -89,7 +89,7 @@ 
for config in "${PARALLELISM_CONFIGS[@]}"; do echo "Running full finetuning with EP=$EP, TP=$TP, PP=$PP, SP=$SP" uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ --recipe ${MODEL_NAME}_finetune_config \ - --step_func vlm_step \ + --step_func qwen3_vl_step \ checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ model.seq_length=$SEQ_LENGTH \ train.train_iters=$TRAIN_ITERS \ diff --git a/scripts/training/run_recipe.py b/scripts/training/run_recipe.py index 818121fd41..35a052073e 100755 --- a/scripts/training/run_recipe.py +++ b/scripts/training/run_recipe.py @@ -52,6 +52,7 @@ from typing import Callable import megatron.bridge.recipes as recipes +from megatron.bridge.models.qwen_vl.qwen3_vl_step import forward_step as qwen3_vl_forward_step from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.finetune import finetune from megatron.bridge.training.gpt_step import forward_step as gpt_forward_step @@ -64,6 +65,7 @@ STEP_FUNCTIONS: dict[str, Callable] = { "gpt_step": gpt_forward_step, "vlm_step": vlm_forward_step, + "qwen3_vl_step": qwen3_vl_forward_step, "llava_step": llava_forward_step, } diff --git a/src/megatron/bridge/models/qwen_vl/qwen3_vl_step.py b/src/megatron/bridge/models/qwen_vl/qwen3_vl_step.py index d949b5a112..7c9343438c 100644 --- a/src/megatron/bridge/models/qwen_vl/qwen3_vl_step.py +++ b/src/megatron/bridge/models/qwen_vl/qwen3_vl_step.py @@ -283,6 +283,8 @@ def forward_step( original_tokens.shape[0], original_tokens.shape[1], dtype=torch.bool, device=original_tokens.device ) forward_args["attention_mask"] = attention_mask + if forward_args["loss_mask"] is not None: + forward_args["loss_mask"] = forward_args["loss_mask"].reshape(1, -1) # qwen3vl need the original input_ids and position_ids # use split attention mask for calculate loss forward_args["packed_seq_params"] = packed_seq_params