diff --git a/examples/conversion/hf_to_megatron_generate_audio_lm.py b/examples/conversion/hf_to_megatron_generate_audio_lm.py new file mode 100644 index 0000000000..c37852dfbe --- /dev/null +++ b/examples/conversion/hf_to_megatron_generate_audio_lm.py @@ -0,0 +1,450 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Audio-Language Model Generation Script for Qwen2-Audio. + +This script demonstrates how to use Qwen2-Audio models with Megatron-Bridge +for audio understanding tasks. + +Example: + # Audio-Language generation with audio from URL: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --audio_url="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" \ --prompt="What's that sound?" + # Audio-Language generation with local audio file: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --audio_path="/path/to/audio.wav" \ --prompt="Describe what you hear in this audio." + # Text-only generation (no audio): + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --prompt="Hello, how are you?" 
+ + # Load from Megatron checkpoint: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ + --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ + --megatron_model_path="/path/to/megatron/checkpoint" \ + --audio_url="https://example.com/audio.mp3" \ + --prompt="What's in this audio?" +""" + +import argparse +from io import BytesIO +from typing import Optional +from urllib.request import urlopen + +import torch +import torch.distributed as dist +from megatron.core import parallel_state +from megatron.core.pipeline_parallel.schedules import get_forward_backward_func +from transformers import AutoProcessor, AutoTokenizer + +from megatron.bridge import AutoBridge +from megatron.bridge.models.hf_pretrained.utils import is_safe_repo +from megatron.bridge.utils.common_utils import get_last_rank, print_rank_0 + + +# Try to import librosa for audio loading +try: + import librosa + + HAS_LIBROSA = True +except ImportError: + librosa = None + HAS_LIBROSA = False + + +class SingleBatchIterator: + """Iterator that yields a single batch of data for audio-language generation. + Required by the forward_backward_func function. + + This class creates an iterator that yields exactly one batch containing + input tokens, position IDs, attention mask, and optional audio inputs, + then raises StopIteration. Used for single-step inference in the forward pass. 
+ """ + + def __init__( + self, + input_ids, + position_ids, + attention_mask, + input_features=None, + feature_attention_mask=None, + ): + self.batch = dict( + tokens=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + + # Add audio inputs if provided + if input_features is not None: + self.batch["input_features"] = input_features + if feature_attention_mask is not None: + self.batch["feature_attention_mask"] = feature_attention_mask + + self._yielded = False + + def __iter__(self): + return self + + def __next__(self): + if self._yielded: + raise StopIteration + self._yielded = True + return self.batch + + +def alm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor: + """Forward step function for audio-language generation. + Required by the forward_backward_func function. + + Extracts a batch from the data iterator and runs the model forward pass + with the provided input tokens, position IDs, attention mask, and audio inputs. + + Args: + data_iterator: Iterator providing batches of input data + model: The Megatron model to run forward pass on + **kwargs: Additional keyword arguments (unused) + + Returns: + Tuple of (model_output, loss_function) + """ + batch = next(data_iterator) + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "attention_mask": batch.get("attention_mask", None), + } + + # Add audio inputs if present + if "input_features" in batch: + forward_args["input_features"] = batch["input_features"] + if "feature_attention_mask" in batch: + forward_args["feature_attention_mask"] = batch["feature_attention_mask"] + + def loss_func(x, **kwargs): + return x + + model_output = model(**forward_args) + if isinstance(model_output, tuple): + output_tensor, _ = model_output + else: + output_tensor = model_output + + return output_tensor, loss_func + + +def load_audio(audio_path: str, sampling_rate: int = 16000): + """Load an audio file from URL or file path. 
+ + Args: + audio_path: URL or local file path to the audio file + sampling_rate: Target sampling rate for the audio + + Returns: + Audio data as numpy array + """ + if not HAS_LIBROSA: + raise ImportError("librosa is required for audio loading. Please install it: pip install librosa") + + if audio_path.startswith(("http://", "https://")): + audio_data, _ = librosa.load(BytesIO(urlopen(audio_path).read()), sr=sampling_rate) + else: + audio_data, _ = librosa.load(audio_path, sr=sampling_rate) + + return audio_data + + +def process_audio_inputs(processor, audio_path: Optional[str], prompt: str): + """Process audio inputs for audio-language model. + + Args: + processor: AutoProcessor for the audio-language model + audio_path: Path or URL to the audio file (optional) + prompt: Text prompt + + Returns: + Tuple of (input_ids, input_features, feature_attention_mask, messages) + """ + if audio_path: + # Get sampling rate from processor + sampling_rate = processor.feature_extractor.sampling_rate + + # Load audio + audio_data = load_audio(audio_path, sampling_rate) + + # Create messages with audio and text for Qwen2-Audio format + messages = [ + { + "role": "user", + "content": [ + {"type": "audio", "audio_url": audio_path}, + {"type": "text", "text": prompt}, + ], + } + ] + + # Apply chat template + text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + + # Process inputs with audio + inputs = processor(text=text, audio=[audio_data], return_tensors="pt", padding=True) + + return ( + inputs.input_ids, + inputs.input_features, + getattr(inputs, "feature_attention_mask", None), + messages, + ) + else: + # Text-only processing + messages = [{"role": "user", "content": prompt}] + text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + inputs = processor(text=text, return_tensors="pt") + return inputs.input_ids, None, None, messages + + +def main(args) -> None: + """Main function for audio-language 
generation from HuggingFace models. + + Loads an audio-language model either from HuggingFace (with optional conversion to Megatron) + or directly from a Megatron checkpoint, then performs greedy generation + using the provided prompt and optional audio input. + + Args: + args: Parsed command line arguments containing model paths, prompt, + audio path, parallelism settings, and generation parameters + """ + tp = args.tp + pp = args.pp + ep = args.ep + etp = args.etp + + # Choose loading method based on arguments + if args.megatron_model_path: + # Load from Megatron checkpoint + print_rank_0(f"Loading Megatron model from: {args.megatron_model_path}") + + # We still need HF config for tokenizer, but we'll load the model from Megatron checkpoint + # Create bridge from HF config only (no weights) + bridge = AutoBridge.from_hf_pretrained(args.hf_model_path) + + # Initialize model parallel before loading + model_provider = bridge.to_megatron_provider(load_weights=False) + model_provider.tensor_model_parallel_size = tp + model_provider.pipeline_model_parallel_size = pp + model_provider.expert_model_parallel_size = ep + model_provider.expert_tensor_parallel_size = etp + model_provider.pipeline_dtype = torch.bfloat16 + model_provider.finalize() + model_provider.initialize_model_parallel(seed=0) + + # Load the Megatron model directly + model = bridge.load_megatron_model( + args.megatron_model_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + "pipeline_dtype": torch.bfloat16, + }, + wrap_with_ddp=False, + ) + + else: + # Load from HuggingFace and convert to Megatron + print_rank_0(f"Loading HuggingFace model from: {args.hf_model_path}") + bridge = AutoBridge.from_hf_pretrained(args.hf_model_path) + model_provider = bridge.to_megatron_provider(load_weights=True) + model_provider.tensor_model_parallel_size = tp + model_provider.pipeline_model_parallel_size 
= pp + model_provider.expert_model_parallel_size = ep + model_provider.expert_tensor_parallel_size = etp + model_provider.pipeline_dtype = torch.bfloat16 + model_provider.finalize() + model_provider.initialize_model_parallel(seed=0) + model = model_provider.provide_distributed_model(wrap_with_ddp=False) + + model = [m.cuda() for m in model] + for m in model: + m.eval() + + # Set grad_scale_func to None on the model's config for inference + for m in model: + if hasattr(m, "config"): + m.config.grad_scale_func = None + + # Initialize tokenizer and processor + tokenizer = AutoTokenizer.from_pretrained( + args.hf_model_path, + trust_remote_code=is_safe_repo( + trust_remote_code=args.trust_remote_code, + hf_path=args.hf_model_path, + ), + ) + processor = AutoProcessor.from_pretrained( + args.hf_model_path, + trust_remote_code=is_safe_repo( + trust_remote_code=args.trust_remote_code, + hf_path=args.hf_model_path, + ), + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Determine audio path (URL or file) + audio_path = args.audio_url or args.audio_path + + # Process inputs (text and audio if provided) + prompt = args.prompt + input_ids, input_features, feature_attention_mask, messages = process_audio_inputs(processor, audio_path, prompt) + + # Move to GPU + input_ids = input_ids.cuda() + if input_features is not None: + input_features = input_features.cuda() + if feature_attention_mask is not None: + feature_attention_mask = feature_attention_mask.cuda() + + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids) + ) + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + generated_ids = input_ids.clone() + + stop_tokens = [tokenizer.eos_token_id] + + # Greedy generation loop + for step in range(args.max_new_tokens): + with torch.no_grad(): + print_rank_0(f"Generation step {step}") + + fwd_bwd_function = get_forward_backward_func() + + # Keep passing audio 
inputs for all steps to ensure audio features are available + iterator = SingleBatchIterator( + input_ids, position_ids, attention_mask, input_features, feature_attention_mask + ) + + output = fwd_bwd_function( + forward_step_func=alm_forward_step, + data_iterator=iterator, + model=model, + num_microbatches=1, + forward_only=True, + seq_length=input_ids.size(1), + micro_batch_size=1, + collect_non_loss_data=True, + ) + if isinstance(output, list) and len(output) > 0: + output = output[0] + + if parallel_state.is_pipeline_last_stage(): + world_size = parallel_state.get_tensor_model_parallel_world_size() + gathered_tensors = [torch.zeros_like(output) for _ in range(world_size)] + # All-gather operation + dist.all_gather(gathered_tensors, output, group=parallel_state.get_tensor_model_parallel_group()) + # Concatenate along last dimension (dim=2) + output = torch.cat(gathered_tensors, dim=2) + next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) + + # Debug: print token information + if step < 5: # Only for first few iterations + print_rank_0(f"Step {step}: output shape={output.shape}, var={output.var():.4f}") + logits = output[0, -1, :] + top5_vals, top5_ids = torch.topk(logits, 5) + top5_tokens = [tokenizer.decode([idx]) for idx in top5_ids] + print_rank_0(f"Top 5: {list(zip(top5_tokens, top5_vals.tolist()))}") + print_rank_0( + f"Selected: '{tokenizer.decode([next_token_ids.item()])}' (id={next_token_ids.item()})" + ) + else: + next_token_ids = torch.ones((1, 1), device=generated_ids.device, dtype=generated_ids.dtype) + + torch.distributed.broadcast(next_token_ids, get_last_rank()) + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + + input_ids = generated_ids + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device) + .unsqueeze(0) + .expand_as(input_ids) + ) + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + + # If the generated token is the end of sequence token, stop generating 
+ if next_token_ids.item() in stop_tokens: + break + + # Decode the generated sequence + generated_text = tokenizer.decode(list(generated_ids[0]), skip_special_tokens=True) + print_rank_0("======== GENERATED TEXT OUTPUT ========") + if audio_path: + print_rank_0(f"Audio: {audio_path}") + print_rank_0(f"Prompt: {prompt}") + print_rank_0(f"Generated: {generated_text}") + print_rank_0("=======================================") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Audio-Language Generation from HuggingFace Audio-Language Models") + parser.add_argument( + "--hf_model_path", + type=str, + required=True, + help="Path to the HuggingFace audio-language model.", + ) + parser.add_argument( + "--prompt", + type=str, + default="What's that sound?", + help="Input prompt for audio-language generation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=50, + help="Maximum number of new tokens to generate.", + ) + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism size") + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism size") + parser.add_argument("--ep", type=int, default=1, help="Expert parallelism size") + parser.add_argument("--etp", type=int, default=1, help="Expert tensor parallelism size") + parser.add_argument("--megatron_model_path", type=str, default=None, help="Path to the Megatron model checkpoint") + parser.add_argument( + "--audio_path", + type=str, + default=None, + help="Local path to the audio file for audio-language generation (optional).", + ) + parser.add_argument( + "--audio_url", + type=str, + default=None, + help="URL to the audio file for audio-language generation (optional).", + ) + parser.add_argument("--trust_remote_code", action="store_true", help="if trust_remote_code") + args = parser.parse_args() + + main(args) + + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() diff --git 
a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index 2400b55f48..6f00e23e26 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -121,6 +121,11 @@ Qwen25ModelProvider72B, Qwen25ModelProvider500M, ) +from megatron.bridge.models.qwen_audio import ( + Qwen2AudioBridge, + Qwen2AudioModel, + Qwen2AudioModelProvider, +) from megatron.bridge.models.qwen_omni import ( Qwen25OmniBridge, Qwen25OmniModel, @@ -224,6 +229,10 @@ "MimoBridge", # Nemotron Models "NemotronBridge", + # Audio-Language Models + "Qwen2AudioBridge", + "Qwen2AudioModel", + "Qwen2AudioModelProvider", # VL Models "Qwen25VLModel", "Qwen25VLBridge", diff --git a/src/megatron/bridge/models/qwen_audio/__init__.py b/src/megatron/bridge/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..c7885b5594 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model Bridge and Provider implementations. + +This module provides support for Qwen2-Audio audio-language models. 
+ +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + +Supported models: +- Qwen2-Audio-7B +- Qwen2-Audio-7B-Instruct + +Example usage: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + >>> provider = bridge.to_megatron_provider() +""" + +from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel +from megatron.bridge.models.qwen_audio.qwen2_audio_bridge import Qwen2AudioBridge +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import ( + Qwen2AudioModelProvider, +) + + +__all__ = [ + # Bridge + "Qwen2AudioBridge", + # Model + "Qwen2AudioModel", + # Model Providers + "Qwen2AudioModelProvider", +] diff --git a/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py b/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py new file mode 100644 index 0000000000..c87c70f78d --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model for Megatron. 
+ +This module provides the Qwen2AudioModel class that combines: +- HuggingFace's audio encoder (audio_tower) for processing mel spectrograms +- HuggingFace's multimodal projector for audio-to-language projection +- Megatron's language model for text generation + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct +""" + +import types +from typing import TYPE_CHECKING, Optional + +import torch +from megatron.core.transformer.module import MegatronModule +from torch import Tensor + +from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.utils.common_utils import hook_hf_module_setattr_for_tp_grad_sync + + +if TYPE_CHECKING: + from megatron.core.packed_seq_params import PackedSeqParams + + +# Import HuggingFace Qwen2Audio model classes with fallback +try: + from transformers import Qwen2AudioForConditionalGeneration + from transformers.models.qwen2_audio.modeling_qwen2_audio import ( + Qwen2AudioEncoder, + Qwen2AudioMultiModalProjector, + ) + + HAS_QWEN2_AUDIO = True +except ImportError: + Qwen2AudioForConditionalGeneration = None + Qwen2AudioEncoder = None + Qwen2AudioMultiModalProjector = None + HAS_QWEN2_AUDIO = False + + +class Qwen2AudioModel(MegatronModule): + """ + Qwen2-Audio Model wrapper for Megatron. + + This class combines HuggingFace's audio components with Megatron's language model: + - Audio tower (HF): Processes mel spectrograms through Whisper-like encoder + - Multimodal projector (HF): Projects audio features to language model space + - Language model (Megatron): Generates text conditioned on audio and text inputs + + The audio encoder forward pass uses HuggingFace implementation, + while the language model forward pass uses Megatron's optimized implementation. + + Args: + config (GPTModelProvider): Model provider containing configuration for language and audio modules. + pre_process (bool, optional): Whether to construct the audio tower and projector. Default: True. 
+ post_process (bool, optional): Whether to apply post-processing. Default: True. + vp_stage (Optional[int], optional): Pipeline stage for model parallelism. Default: None. + + Attributes: + pre_process (bool): If True, enables audio and multimodal components. + post_process (bool): If True, enables post-processing. + vp_stage (Optional[int]): Pipeline stage for model parallelism. + audio_tower (nn.Module): Audio encoder from HuggingFace (Whisper-like). + multi_modal_projector (nn.Module): Projects audio features to language model space. + language_model (nn.Module): Megatron language model. + + Forward Inputs: + input_ids (torch.LongTensor, optional): Tokenized input ids for the language model. + attention_mask (torch.Tensor, optional): Attention mask for the language model. + position_ids (torch.LongTensor, optional): Position ids for the language model. + inputs_embeds (torch.FloatTensor, optional): Precomputed input embeddings. + input_features (torch.Tensor, optional): Mel spectrogram features for audio. + feature_attention_mask (torch.Tensor, optional): Attention mask for audio features. + labels (torch.Tensor, optional): Target labels for supervised training. + runtime_gather_output (bool, optional): If True, gather outputs across pipeline stages. + loss_mask (Tensor, optional): Mask for loss computation. + + Returns: + Tensor: Model output (e.g., logits or loss, depending on mode). + + Note: + - If `pre_process` is False, only the language model is constructed. + - The audio tower and projector are only active if `pre_process` is True. + - This class is intended for use within the Megatron-LM framework. 
+ """ + + def __init__( + self, + config: GPTModelProvider, + pre_process: bool = True, + post_process: bool = True, + vp_stage: Optional[int] = None, + ) -> None: + super().__init__(config=config) + + self.pre_process = pre_process + self.post_process = post_process + self.vp_stage = vp_stage + + if pre_process: + if not HAS_QWEN2_AUDIO: + raise ImportError( + "Qwen2Audio model requires transformers with Qwen2Audio support. " + "Please upgrade: pip install 'transformers>=4.40.0'" + ) + + # Initialize audio tower from HuggingFace config + # The audio_tower is a Whisper-like encoder that processes mel spectrograms + self.audio_tower = Qwen2AudioEncoder(config.hf_config.audio_config) + + # Initialize multimodal projector from HuggingFace config + # Projects audio encoder output dimension to language model hidden size + self.multi_modal_projector = Qwen2AudioMultiModalProjector(config.hf_config) + + # Ensure HF audio tower params are marked for TP grad sync + hook_hf_module_setattr_for_tp_grad_sync(self.audio_tower) + hook_hf_module_setattr_for_tp_grad_sync(self.multi_modal_projector) + + # Initialize Megatron language model + self.language_model = self.config.provide_language_model( + pre_process=pre_process, post_process=post_process, vp_stage=vp_stage + ) + + # Finalize grad requires these to be bound with module + self.share_embeddings_and_output_weights = config.share_embeddings_and_output_weights + self.shared_embedding_or_output_weight = self.language_model.shared_embedding_or_output_weight + + # Monkey-patch methods from HuggingFace Qwen2AudioForConditionalGeneration + if HAS_QWEN2_AUDIO and Qwen2AudioForConditionalGeneration is not None: + self._merge_input_ids_with_audio_features = types.MethodType( + Qwen2AudioForConditionalGeneration._merge_input_ids_with_audio_features, self + ) + + # Store audio token id from config + self.audio_token_id = getattr(config, "audio_token_id", 151646) + self.pad_token_id = getattr(config.hf_config, "pad_token_id", -1) + + 
def set_input_tensor(self, input_tensor) -> None: + """Set model chunk input tensor.""" + self.language_model.set_input_tensor(input_tensor) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.Tensor] = None, + feature_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + packed_seq_params: Optional["PackedSeqParams"] = None, + *, + loss_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Forward pass combining HuggingFace audio encoder with Megatron language model. + + Args: + input_ids: Tokenized input ids for the language model. + attention_mask: Attention mask for the language model. + position_ids: Position ids for the language model. + inputs_embeds: Precomputed input embeddings. + input_features: Mel spectrogram features for audio input. + feature_attention_mask: Attention mask for audio features. + labels: Target labels for supervised training. + runtime_gather_output: If True, gather outputs across pipeline stages. + loss_mask: Mask for loss computation. + + Returns: + Tensor: Model output containing logits or loss. 
+ """ + if self.pre_process: + if inputs_embeds is None: + # Get text embeddings from Megatron language model + inputs_embeds = self.language_model.embedding( + input_ids=input_ids, position_ids=None + ) # [seq_len, batch, hidden] + + # Transpose to HF format [batch, seq_len, hidden] + inputs_embeds = inputs_embeds.transpose(1, 0).contiguous() + + if input_features is not None and input_ids.shape[1] != 1: + # Process audio features + target_device = self.audio_tower.conv1.weight.device + + input_features = input_features.to(target_device) + if feature_attention_mask is not None: + feature_attention_mask = feature_attention_mask.to(target_device) + + # Compute audio feature lengths from attention mask + audio_feat_lengths, audio_output_lengths = self.audio_tower._get_feat_extract_output_lengths( + feature_attention_mask.sum(-1) + ) + + batch_size, _, max_mel_seq_len = input_features.shape + max_seq_len = (max_mel_seq_len - 2) // 2 + 1 + + # Create attention mask for audio encoder + seq_range = ( + torch.arange(0, max_seq_len, dtype=audio_feat_lengths.dtype, device=audio_feat_lengths.device) + .unsqueeze(0) + .expand(batch_size, max_seq_len) + ) + lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len) + padding_mask = seq_range >= lengths_expand + + audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand( + batch_size, 1, max_seq_len, max_seq_len + ) + audio_attention_mask = audio_attention_mask_.to( + dtype=self.audio_tower.conv1.weight.dtype, device=target_device + ) + audio_attention_mask[audio_attention_mask_] = float("-inf") + + # Forward through audio encoder + audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask) + selected_audio_feature = audio_outputs.last_hidden_state + + # Project audio features to language model dimension + audio_features = self.multi_modal_projector(selected_audio_feature) + + # Check if we need legacy processing (non-expanded audio tokens) + audio_tokens = 
input_ids == self.audio_token_id + legacy_processing = (audio_tokens[:, :-1] & audio_tokens[:, 1:]).sum() == 0 + + if legacy_processing: + # Use HF's merge function for legacy processing + inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features( + audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels + ) + else: + # Modern processing: audio tokens are already expanded + num_audios, max_audio_tokens, embed_dim = audio_features.shape + audio_features_mask = torch.arange(max_audio_tokens, device=audio_output_lengths.device)[None, :] + audio_features_mask = audio_features_mask < audio_output_lengths[:, None] + audio_features = audio_features[audio_features_mask] + + n_audio_tokens = (input_ids == self.audio_token_id).sum().item() + n_audio_features = audio_features.shape[0] + + if n_audio_tokens != n_audio_features: + raise ValueError( + f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}" + ) + + special_audio_mask = (input_ids == self.audio_token_id).to(inputs_embeds.device) + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) + + # Transpose back to Megatron format [seq_len, batch, hidden] + inputs_embeds = inputs_embeds.transpose(1, 0).contiguous() + + # Forward through Megatron language model + outputs = self.language_model.forward( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + decoder_input=inputs_embeds, + labels=labels, + loss_mask=loss_mask, + runtime_gather_output=runtime_gather_output, + packed_seq_params=packed_seq_params, + ) + + return outputs + + def freeze( + self, + freeze_language_model: bool, + freeze_audio_model: bool, + freeze_audio_projection: bool, + ): + """Freeze model modules. 
+ + Make specific modules non-trainable by setting requires_grad to False. + + Args: + freeze_language_model (bool): Freeze the language model module. + freeze_audio_model (bool): Freeze the audio model module (audio_tower). + freeze_audio_projection (bool): Freeze the audio projection module (multi_modal_projector). + """ + modules = [] + + if freeze_language_model and hasattr(self, "language_model") and self.language_model is not None: + modules.append(self.language_model) + + if freeze_audio_model and hasattr(self, "audio_tower") and self.audio_tower is not None: + modules.append(self.audio_tower) + + if ( + freeze_audio_projection + and hasattr(self, "multi_modal_projector") + and self.multi_modal_projector is not None + ): + modules.append(self.multi_modal_projector) + + for module in modules: + for param in module.parameters(): + param.requires_grad = False diff --git a/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py b/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py new file mode 100644 index 0000000000..ca8b57ded0 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Megatron Bridge for Qwen2-Audio Models. + +This module provides the bridge implementation for converting between HuggingFace +Qwen2-Audio models and Megatron-Core format. 
+ +Supported models: +- Qwen2-Audio-7B +- Qwen2-Audio-7B-Instruct + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct +""" + +from transformers import Qwen2AudioForConditionalGeneration + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.param_mapping import ( + AutoMapping, + GatedMLPMapping, + QKVMapping, + ReplicatedMapping, +) +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM +from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import Qwen2AudioModelProvider + + +@MegatronModelBridge.register_bridge( + source=Qwen2AudioForConditionalGeneration, + target=Qwen2AudioModel, + provider=Qwen2AudioModelProvider, + model_type="qwen2_audio", +) +class Qwen2AudioBridge(MegatronModelBridge): + """ + Megatron Bridge for Qwen2-Audio Models. + + This bridge handles conversion between HuggingFace Qwen2AudioForConditionalGeneration + and Megatron-Core Qwen2AudioModel format for audio-language models. + + The weight mappings handle: + - Audio encoder weights (audio_tower) + - Language model weights + - Multimodal projector weights + + Example: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + >>> provider = bridge.to_megatron_provider() + """ + + def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> Qwen2AudioModelProvider: + """ + Create a Qwen2AudioModelProvider from a HuggingFace pretrained model. 
+ + Args: + hf_pretrained: HuggingFace pretrained model + + Returns: + Qwen2AudioModelProvider configured with the HF model's parameters + """ + hf_config = hf_pretrained.config + + # Qwen2-Audio has separate text_config and audio_config + text_config = getattr(hf_config, "text_config", hf_config) + + # Use base class helper for common config conversion + provider_kwargs = self.hf_config_to_provider_kwargs(text_config) + provider = Qwen2AudioModelProvider(**provider_kwargs) + + # Qwen2-specific settings + provider.normalization = "RMSNorm" + provider.gated_linear_unit = True + provider.add_qkv_bias = True + provider.add_bias_linear = False + provider.hidden_dropout = 0.0 + + # Audio-specific settings + provider.hf_config = hf_config + provider.audio_token_id = getattr(hf_config, "audio_token_index", 151646) + provider.bos_token_id = getattr(hf_config, "bos_token_id", 151643) + provider.eos_token_id = getattr(hf_config, "eos_token_id", 151645) + provider.pad_token_id = getattr(hf_config, "pad_token_id", 151643) + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + """ + Return MegatronMappingRegistry containing parameter mappings for audio-language models. 
+ + HuggingFace weight structure: + - language_model.model.embed_tokens.weight + - language_model.model.layers.{i}.input_layernorm.weight + - language_model.model.layers.{i}.self_attn.{q,k,v,o}_proj.weight + - language_model.model.layers.{i}.post_attention_layernorm.weight + - language_model.model.layers.{i}.mlp.{gate,up,down}_proj.weight + - language_model.model.norm.weight + - language_model.lm_head.weight + - audio_tower.** (conv1, conv2, embed_positions, layers, layer_norm, avg_pooler) + - multi_modal_projector.linear.weight + + Returns: + MegatronMappingRegistry with all parameter mappings + """ + # Language model direct mappings + # Maps: Megatron param name -> HuggingFace param name + param_mappings = { + # Embeddings and output layers + "language_model.embedding.word_embeddings.weight": "language_model.model.embed_tokens.weight", + "language_model.output_layer.weight": "language_model.lm_head.weight", + "language_model.decoder.final_layernorm.weight": "language_model.model.norm.weight", + # Layer normalization for attention and MLP + "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "language_model.model.layers.*.input_layernorm.weight", + "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "language_model.model.layers.*.post_attention_layernorm.weight", + # Attention output projection + "language_model.decoder.layers.*.self_attention.linear_proj.weight": "language_model.model.layers.*.self_attn.o_proj.weight", + # MLP output projection + "language_model.decoder.layers.*.mlp.linear_fc2.weight": "language_model.model.layers.*.mlp.down_proj.weight", + } + + mapping_list = [] + # Convert each dictionary entry to AutoMapping(megatron_param, hf_param) + for megatron_param, hf_param in param_mappings.items(): + mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param)) + + # Add special mappings that require parameter transformation + mapping_list.extend( + [ + # Audio tower weights are 
replicated directly + # Includes: conv1, conv2, embed_positions, layers.*.self_attn.*, layers.*.fc1, layers.*.fc2, layer_norm, avg_pooler + ReplicatedMapping( + megatron_param="audio_tower.**", + hf_param="audio_tower.**", + ), + # Multimodal projector weights (linear layer) + ReplicatedMapping( + megatron_param="multi_modal_projector.**", + hf_param="multi_modal_projector.**", + ), + # QKV: Combine separate Q, K, V matrices into single QKV matrix + QKVMapping( + megatron_param="language_model.decoder.layers.*.self_attention.linear_qkv.weight", + q="language_model.model.layers.*.self_attn.q_proj.weight", + k="language_model.model.layers.*.self_attn.k_proj.weight", + v="language_model.model.layers.*.self_attn.v_proj.weight", + ), + # QKV bias: Combine separate Q, K, V biases into single QKV bias (Qwen2 specific) + QKVMapping( + megatron_param="language_model.decoder.layers.*.self_attention.linear_qkv.bias", + q="language_model.model.layers.*.self_attn.q_proj.bias", + k="language_model.model.layers.*.self_attn.k_proj.bias", + v="language_model.model.layers.*.self_attn.v_proj.bias", + ), + # Gated MLP: Combine gate and up projection matrices into single FC1 matrix + GatedMLPMapping( + megatron_param="language_model.decoder.layers.*.mlp.linear_fc1.weight", + gate="language_model.model.layers.*.mlp.gate_proj.weight", + up="language_model.model.layers.*.mlp.up_proj.weight", + ), + ] + ) + + return MegatronMappingRegistry(*mapping_list) diff --git a/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py b/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py new file mode 100644 index 0000000000..b4eb069444 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model Provider configurations for Megatron-Core. + +This module provides configuration classes for Qwen2-Audio models, +compatible with HuggingFace's Qwen2-Audio model configurations. + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + +Qwen2-Audio Key Features: +- Audio-language capabilities with separate language model and audio encoder +- Whisper-like audio encoder for processing mel spectrograms +- Based on Qwen2 language model architecture +""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +from megatron.core.models.gpt import GPTModel as MCoreGPTModel + +from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.models.qwen.qwen_provider import Qwen2ModelProvider + + +if TYPE_CHECKING: + from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel + + +# ============================================================================= +# Qwen2-Audio Model Provider +# ============================================================================= + + +@dataclass +class Qwen2AudioModelProvider(Qwen2ModelProvider): + """ + Base model provider for Qwen2-Audio Models. + + Qwen2-Audio is a multimodal model combining a Whisper-like audio encoder + with a Qwen2 language model for audio understanding tasks. 
+ + Reference: + - https://huggingface.co/Qwen/Qwen2-Audio-7B + - https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + + Key Features: + - Audio encoder based on Whisper architecture + - Supports variable-length audio inputs via mel spectrograms + - Multi-turn conversation with audio context + """ + + # Audio-Language models shouldn't scatter embeddings across sequence parallel regions + # because audio embeddings are inserted into language embeddings + scatter_embedding_sequence_parallel: bool = False + + # HuggingFace config containing audio_config and text_config + hf_config: Optional[Any] = None + + # Audio-specific token IDs (defaults from Qwen2-Audio) + audio_token_id: int = 151646 # <|AUDIO|> token + + # Token IDs + bos_token_id: int = 151643 + eos_token_id: int = 151645 + pad_token_id: int = 151643 + + # Freeze options for fine-tuning + freeze_language_model: bool = False + freeze_audio_model: bool = False + freeze_audio_projection: bool = False + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> "Qwen2AudioModel": + """ + Provide a Qwen2AudioModel instance with audio and language components. 
+
+        Args:
+            pre_process: Whether this is the first stage in pipeline parallelism
+            post_process: Whether this is the last stage in pipeline parallelism
+            vp_stage: Virtual pipeline stage number
+
+        Returns:
+            Qwen2AudioModel instance with HF audio encoder and Megatron language model
+        """
+        # Local import — presumably to avoid a circular import between this
+        # provider module and modeling_qwen2_audio at load time; confirm.
+        from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel
+
+        # The provider itself acts as the model config.
+        model = Qwen2AudioModel(
+            config=self,
+            pre_process=pre_process,
+            post_process=post_process,
+            vp_stage=vp_stage,
+        )
+
+        # Apply freeze options if any are enabled for fine-tuning
+        if self.freeze_language_model or self.freeze_audio_model or self.freeze_audio_projection:
+            model.freeze(
+                freeze_language_model=self.freeze_language_model,
+                freeze_audio_model=self.freeze_audio_model,
+                freeze_audio_projection=self.freeze_audio_projection,
+            )
+
+        return model
+
+    def provide_language_model(self, pre_process=None, post_process=None, vp_stage=None) -> MCoreGPTModel:
+        """
+        Provide just the language model component without audio.
+
+        Args:
+            pre_process: Whether this is the first stage in pipeline parallelism
+            post_process: Whether this is the last stage in pipeline parallelism
+            vp_stage: Virtual pipeline stage number
+
+        Returns:
+            MCoreGPTModel instance (language model only)
+        """
+        # Call GPTModelProvider.provide explicitly (unbound) to bypass this
+        # class's own provide() override, which would wrap the result in a
+        # Qwen2AudioModel; this yields the bare Megatron GPT language model.
+        return GPTModelProvider.provide(self, pre_process=pre_process, post_process=post_process, vp_stage=vp_stage)
diff --git a/tests/functional_tests/L2_Launch_models_qwen_audio.sh b/tests/functional_tests/L2_Launch_models_qwen_audio.sh
new file mode 100755
index 0000000000..73226fec36
--- /dev/null
+++ b/tests/functional_tests/L2_Launch_models_qwen_audio.sh
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +uv run coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest \ + -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \ + tests/functional_tests/models/qwen_audio +coverage combine -q diff --git a/tests/functional_tests/models/qwen_audio/__init__.py b/tests/functional_tests/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..341a77c5bc --- /dev/null +++ b/tests/functional_tests/models/qwen_audio/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py b/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py new file mode 100644 index 0000000000..5ac918cd5d --- /dev/null +++ b/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functional tests for Qwen2 Audio HF to Megatron generation. + +Example run commands: + # Run all generation tests + pytest tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py + + # Run specific test + pytest tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py::TestQwen2AudioGeneration::test_qwen2_audio_generation + +Note: These tests use small proxy/toy models for fast generation testing. 
+""" + +import json +import subprocess +import sys +from pathlib import Path + +import pytest +import torch +from transformers import AutoTokenizer, Qwen2AudioConfig, Qwen2AudioForConditionalGeneration + + +HF_QWEN2_AUDIO_TOY_MODEL_CONFIG = { + "architectures": ["Qwen2AudioForConditionalGeneration"], + "audio_token_index": 151646, + "model_type": "qwen2_audio", + "audio_config": { + "model_type": "qwen2_audio_encoder", + "num_mel_bins": 128, + "d_model": 256, + "encoder_layers": 4, + "encoder_attention_heads": 4, + "encoder_ffn_dim": 512, + "dropout": 0.0, + "attention_dropout": 0.0, + "activation_function": "gelu", + "activation_dropout": 0.0, + "encoder_layerdrop": 0.0, + "num_hidden_layers": 4, + "initializer_range": 0.02, + "scale_embedding": False, + "max_source_positions": 1500, + }, + "text_config": { + "vocab_size": 151936, + "max_position_embeddings": 32768, + "hidden_size": 256, + "intermediate_size": 512, + "num_hidden_layers": 4, + "num_attention_heads": 4, + "num_key_value_heads": 4, + "hidden_act": "silu", + "initializer_range": 0.02, + "rms_norm_eps": 1e-06, + "use_cache": True, + "rope_theta": 10000.0, + "attention_dropout": 0.0, + "tie_word_embeddings": False, + }, +} + + +class TestQwen2AudioGeneration: + """ + Test Qwen2 Audio model generation using HF to Megatron conversion with audio inputs. + Uses small proxy/toy models for fast generation testing. + """ + + @pytest.fixture(scope="class") + def qwen2_audio_toy_model_path(self, tmp_path_factory): + """ + Create and save a HuggingFace Qwen2 Audio toy model to a temporary directory. 
+ + Args: + tmp_path_factory: Pytest temporary path factory for class-scoped fixtures + + Returns: + str: Path to the saved HuggingFace model directory + """ + # Create a temporary directory for this test class + temp_dir = tmp_path_factory.mktemp("qwen2_audio_generation_toy_model") + model_dir = temp_dir / "qwen2_audio_toy" + + # Create Qwen2 Audio config from the toy model config + config = Qwen2AudioConfig(**HF_QWEN2_AUDIO_TOY_MODEL_CONFIG) + config.torch_dtype = torch.bfloat16 + + # Create model with random weights and convert to bfloat16 + model = Qwen2AudioForConditionalGeneration(config) + model = model.to(dtype=torch.bfloat16) + + # Download and save tokenizer and processor from a reference Qwen2 Audio model + try: + from transformers import AutoProcessor + + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + tokenizer.save_pretrained(model_dir) + + # Also save the processor + processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + processor.save_pretrained(model_dir) + except Exception as e: + print(f"Warning: Could not download tokenizer/processor, creating minimal files: {e}") + # Create minimal tokenizer files if download fails + tokenizer_config = { + "tokenizer_class": "Qwen2Tokenizer", + "vocab_size": 151936, + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + } + with open(model_dir / "tokenizer_config.json", "w") as f: + json.dump(tokenizer_config, f, indent=2) + + # Save model and config to directory + model.save_pretrained(model_dir, safe_serialization=True) + + # Also save config.json explicitly + config_path = model_dir / "config.json" + with open(config_path, "w") as f: + json.dump(HF_QWEN2_AUDIO_TOY_MODEL_CONFIG, f, indent=2) + + print(f"Created toy model at: {model_dir}") + return str(model_dir) + + @pytest.mark.run_only_on("GPU") + def test_qwen2_audio_generation(self, qwen2_audio_toy_model_path): + """ + Test Qwen2 
Audio toy model with audio generation. + Uses a small proxy model instead of the full 7B model for fast testing. + Uses real audio to test audio-language pipeline. + + Args: + qwen2_audio_toy_model_path: Path to the toy Qwen2 Audio model (from fixture) + """ + cmd = [ + sys.executable, + "-m", + "torch.distributed.run", + "--nproc_per_node=2", + "examples/conversion/hf_to_megatron_generate_audio_lm.py", + f"--hf_model_path={qwen2_audio_toy_model_path}", + "--audio_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + "--prompt=What's that sound?", + "--tp=2", + "--max_new_tokens=50", + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent.parent.parent, + ) + + # Print output for debugging + print("\n" + "=" * 80) + print("STDOUT:") + print(result.stdout) + print("\n" + "=" * 80) + print("STDERR:") + print(result.stderr) + print("=" * 80 + "\n") + + if result.returncode != 0: + assert False, f"Qwen2-Audio toy model generation failed with return code {result.returncode}" + + print("SUCCESS: Qwen2-Audio toy model generation test completed successfully") + + except subprocess.TimeoutExpired: + assert False, "Qwen2-Audio toy model generation test timed out after 5 minutes" + except Exception as e: + print(f"Error during Qwen2-Audio toy model generation test: {e}") + raise diff --git a/tests/unit_tests/models/qwen_audio/__init__.py b/tests/unit_tests/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..2f4cc5bc8c --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Qwen Audio models.""" diff --git a/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py new file mode 100644 index 0000000000..fc255479d2 --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py @@ -0,0 +1,358 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import pytest +import torch + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM +from megatron.bridge.models.qwen_audio.qwen2_audio_bridge import Qwen2AudioBridge +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import Qwen2AudioModelProvider + + +@pytest.fixture +def mock_text_config(): + """Create a mock text config for Qwen2-Audio.""" + text_config = SimpleNamespace( + num_hidden_layers=32, + hidden_size=4096, + intermediate_size=11008, + num_attention_heads=32, + num_key_value_heads=32, + initializer_range=0.02, + rms_norm_eps=1e-6, + vocab_size=151936, + max_position_embeddings=4096, + rope_theta=1000000.0, + tie_word_embeddings=False, + hidden_act="silu", + rope_scaling=None, + torch_dtype=torch.bfloat16, + bos_token_id=151643, + eos_token_id=151645, + ) + return text_config + + +@pytest.fixture +def mock_audio_config(): + """Create a mock audio encoder config for Qwen2-Audio.""" + audio_config = Mock() + audio_config.d_model = 1280 + audio_config.encoder_layers = 32 + audio_config.encoder_attention_heads = 20 + audio_config.encoder_ffn_dim = 5120 + return audio_config + + +@pytest.fixture +def mock_hf_config(mock_text_config, mock_audio_config): + """Create a mock HF config for Qwen2-Audio.""" + config = Mock() + config.text_config = mock_text_config + config.audio_config = mock_audio_config + config.tie_word_embeddings = False + config.audio_token_index = 151646 + config.bos_token_id = 151643 + config.eos_token_id = 151645 + config.pad_token_id = 151643 + return config + + +@pytest.fixture +def mock_hf_pretrained(mock_hf_config): + """Create a mock HF pretrained VLM.""" + pretrained = Mock(spec=PreTrainedVLM) + pretrained.config = mock_hf_config + return pretrained + + +@pytest.fixture +def qwen2_audio_bridge(): + """Create a Qwen2AudioBridge instance.""" + return 
Qwen2AudioBridge() + + +class TestQwen2AudioBridgeInitialization: + """Test Qwen2AudioBridge initialization and basic functionality.""" + + def test_bridge_initialization(self, qwen2_audio_bridge): + """Test that bridge can be initialized.""" + assert isinstance(qwen2_audio_bridge, Qwen2AudioBridge) + + def test_bridge_has_required_methods(self, qwen2_audio_bridge): + """Test that bridge has required methods.""" + assert hasattr(qwen2_audio_bridge, "provider_bridge") + assert callable(qwen2_audio_bridge.provider_bridge) + + assert hasattr(qwen2_audio_bridge, "mapping_registry") + assert callable(qwen2_audio_bridge.mapping_registry) + + +class TestQwen2AudioBridgeProviderBridge: + """Test provider_bridge method functionality.""" + + def test_provider_bridge_basic_config(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge creates correct provider with basic config.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert isinstance(provider, Qwen2AudioModelProvider) + + # Check basic transformer config + assert provider.num_layers == 32 + assert provider.hidden_size == 4096 + assert provider.ffn_hidden_size == 11008 + assert provider.num_attention_heads == 32 + assert provider.num_query_groups == 32 + assert provider.init_method_std == 0.02 + assert provider.layernorm_epsilon == 1e-6 + assert provider.vocab_size == 151936 + assert provider.seq_length == 4096 + assert provider.rotary_base == 1000000.0 + assert provider.share_embeddings_and_output_weights is False + + def test_provider_bridge_audio_specific_config(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge creates correct audio-specific configuration.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + # Check audio-specific token IDs + assert provider.audio_token_id == 151646 + assert provider.bos_token_id == 151643 + assert provider.eos_token_id == 151645 + assert provider.pad_token_id == 151643 + + # Check hf_config is 
propagated + assert provider.hf_config is mock_hf_pretrained.config + + # Check Qwen2-specific settings + assert provider.add_qkv_bias is True + + def test_provider_bridge_qwen2_settings(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge sets Qwen2-specific settings correctly.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.normalization == "RMSNorm" + assert provider.gated_linear_unit is True + assert provider.add_qkv_bias is True + assert provider.add_bias_linear is False + assert provider.hidden_dropout == 0.0 + + def test_provider_bridge_with_custom_token_ids(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with custom token IDs from config.""" + mock_hf_pretrained.config.audio_token_index = 200000 + mock_hf_pretrained.config.bos_token_id = 200001 + mock_hf_pretrained.config.eos_token_id = 200002 + mock_hf_pretrained.config.pad_token_id = 200003 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.audio_token_id == 200000 + assert provider.bos_token_id == 200001 + assert provider.eos_token_id == 200002 + assert provider.pad_token_id == 200003 + + def test_provider_bridge_with_tied_embeddings(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with tied embeddings.""" + mock_hf_pretrained.config.text_config.tie_word_embeddings = True + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.share_embeddings_and_output_weights is True + + @patch.object(Qwen2AudioBridge, "dtype_from_hf") + def test_provider_bridge_dtype_handling_fp16(self, mock_dtype_from_hf, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles fp16 dtype correctly.""" + mock_dtype_from_hf.return_value = torch.float16 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.fp16 is True + assert provider.bf16 is False + assert provider.params_dtype == torch.float16 + 
+ @patch.object(Qwen2AudioBridge, "dtype_from_hf") + def test_provider_bridge_dtype_handling_bf16(self, mock_dtype_from_hf, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles bfloat16 dtype correctly.""" + mock_dtype_from_hf.return_value = torch.bfloat16 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.fp16 is False + assert provider.bf16 is True + assert provider.params_dtype == torch.bfloat16 + + @patch.object(Qwen2AudioBridge, "make_vocab_size_divisible_by") + def test_provider_bridge_vocab_size_divisibility(self, mock_divisible, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles vocab size divisibility.""" + mock_divisible.return_value = 128 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + mock_divisible.assert_called_once_with(151936) + assert provider.make_vocab_size_divisible_by == 128 + + +class TestQwen2AudioBridgeMappingRegistry: + """Test mapping_registry method functionality.""" + + def _get_mapping_names(self, registry): + """Helper to extract all mapping param names from a registry.""" + mapping_names = [] + for mapping in registry.mappings: + if hasattr(mapping, "megatron_param"): + mapping_names.append(str(getattr(mapping, "megatron_param"))) + hf = getattr(mapping, "hf_param", None) + if isinstance(hf, dict): + mapping_names.extend([str(v) for v in hf.values()]) + elif isinstance(hf, str): + mapping_names.append(hf) + return mapping_names + + def test_mapping_registry_returns_correct_type(self, qwen2_audio_bridge): + """Test mapping_registry returns MegatronMappingRegistry.""" + registry = qwen2_audio_bridge.mapping_registry() + + assert isinstance(registry, MegatronMappingRegistry) + + def test_mapping_registry_contains_embeddings(self, qwen2_audio_bridge): + """Test mapping_registry contains word embeddings mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + 
has_embeddings = any("embed_tokens" in name or "word_embeddings" in name for name in mapping_names) + assert has_embeddings, "Should contain embeddings mapping" + + def test_mapping_registry_contains_audio_tower(self, qwen2_audio_bridge): + """Test mapping_registry contains audio_tower mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_audio_tower = any("audio_tower" in name for name in mapping_names) + assert has_audio_tower, "Should contain audio_tower mapping" + + def test_mapping_registry_contains_projector(self, qwen2_audio_bridge): + """Test mapping_registry contains multi_modal_projector mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_projector = any("multi_modal_projector" in name for name in mapping_names) + assert has_projector, "Should contain multi_modal_projector mapping" + + def test_mapping_registry_contains_qkv(self, qwen2_audio_bridge): + """Test mapping_registry contains QKV parameter mappings.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_qkv = any("linear_qkv" in name for name in mapping_names) + assert has_qkv, "Should contain QKV mappings" + + def test_mapping_registry_contains_mlp(self, qwen2_audio_bridge): + """Test mapping_registry contains MLP parameter mappings.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_mlp = any("mlp" in name for name in mapping_names) + assert has_mlp, "Should contain MLP mappings" + + +class TestQwen2AudioBridgeEdgeCases: + """Test edge cases and error conditions.""" + + def test_provider_bridge_with_minimal_config(self, qwen2_audio_bridge): + """Test provider_bridge with minimal HF config.""" + minimal_pretrained = Mock(spec=PreTrainedVLM) + minimal_config = Mock() + + text_config = SimpleNamespace( + num_hidden_layers=24, + 
hidden_size=2048, + intermediate_size=5504, + num_attention_heads=16, + num_key_value_heads=16, + initializer_range=0.02, + rms_norm_eps=1e-6, + vocab_size=151936, + max_position_embeddings=4096, + rope_theta=1000000.0, + hidden_act="silu", + tie_word_embeddings=False, + rope_scaling=None, + torch_dtype=torch.bfloat16, + ) + + minimal_config.text_config = text_config + minimal_config.tie_word_embeddings = False + minimal_config.audio_token_index = 151646 + minimal_config.bos_token_id = 151643 + minimal_config.eos_token_id = 151645 + minimal_config.pad_token_id = 151643 + minimal_pretrained.config = minimal_config + + provider = qwen2_audio_bridge.provider_bridge(minimal_pretrained) + + assert isinstance(provider, Qwen2AudioModelProvider) + assert provider.num_layers == 24 + assert provider.hidden_size == 2048 + + def test_provider_bridge_with_different_vocab_sizes(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different vocabulary sizes.""" + test_vocab_sizes = [32000, 151936, 152064] + + for vocab_size in test_vocab_sizes: + mock_hf_pretrained.config.text_config.vocab_size = vocab_size + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.vocab_size == vocab_size + + def test_provider_bridge_with_different_sequence_lengths(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different sequence lengths.""" + test_seq_lengths = [2048, 4096, 8192, 32768] + + for seq_length in test_seq_lengths: + mock_hf_pretrained.config.text_config.max_position_embeddings = seq_length + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.seq_length == seq_length + + +class TestQwen2AudioBridgeCompatibility: + """Test compatibility with different HF model configurations.""" + + def test_provider_bridge_with_group_query_attention(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with group query attention.""" + 
mock_hf_pretrained.config.text_config.num_attention_heads = 32 + mock_hf_pretrained.config.text_config.num_key_value_heads = 8 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.num_attention_heads == 32 + assert provider.num_query_groups == 8 + + def test_provider_bridge_with_different_rope_theta(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different RoPE theta values.""" + test_rope_values = [10000.0, 500000.0, 1000000.0] + + for rope_theta in test_rope_values: + mock_hf_pretrained.config.text_config.rope_theta = rope_theta + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.rotary_base == rope_theta + + def test_provider_bridge_with_missing_audio_token_index(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with missing audio_token_index uses default.""" + delattr(mock_hf_pretrained.config, "audio_token_index") + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.audio_token_id == 151646 diff --git a/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py new file mode 100644 index 0000000000..4b53cb3212 --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py @@ -0,0 +1,209 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from megatron.bridge.models.qwen_audio import Qwen2AudioModelProvider


def _build_provider(**overrides):
    """Return a Qwen2AudioModelProvider from a small base config plus *overrides*.

    The base config (32 layers, hidden size 4096, 32 attention heads) matches
    the configuration repeated throughout these tests; any keyword argument
    passed here replaces or extends it.
    """
    kwargs = {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}
    kwargs.update(overrides)
    return Qwen2AudioModelProvider(**kwargs)


class TestQwen2AudioModelProvider:
    """Unit tests for construction and configuration of Qwen2AudioModelProvider."""

    def test_initialization(self):
        """Core dimensions passed at construction are reported back unchanged."""
        provider = _build_provider()

        assert (provider.num_layers, provider.hidden_size, provider.num_attention_heads) == (32, 4096, 32)

    def test_audio_specific_defaults(self):
        """Defaults specific to the audio-language variant are applied."""
        provider = _build_provider()

        # Audio-language models shouldn't scatter embeddings.
        assert provider.scatter_embedding_sequence_parallel is False

        # No HF config is attached unless one is supplied explicitly.
        assert provider.hf_config is None

        # Default special-token ids.
        assert provider.audio_token_id == 151646
        assert provider.bos_token_id == 151643
        assert provider.eos_token_id == 151645
        assert provider.pad_token_id == 151643

        # Nothing is frozen out of the box.
        assert provider.freeze_language_model is False
        assert provider.freeze_audio_model is False
        assert provider.freeze_audio_projection is False

    def test_custom_token_ids(self):
        """Caller-supplied token ids override the defaults."""
        provider = _build_provider(
            audio_token_id=200,
            bos_token_id=201,
            eos_token_id=202,
            pad_token_id=203,
        )

        expected = {"audio_token_id": 200, "bos_token_id": 201, "eos_token_id": 202, "pad_token_id": 203}
        for field_name, value in expected.items():
            assert getattr(provider, field_name) == value

    def test_freeze_options(self):
        """All three freeze switches can be enabled together."""
        provider = _build_provider(
            freeze_language_model=True,
            freeze_audio_model=True,
            freeze_audio_projection=True,
        )

        assert provider.freeze_language_model is True
        assert provider.freeze_audio_model is True
        assert provider.freeze_audio_projection is True

    def test_custom_hf_config(self):
        """An explicit hf_config is stored by identity, not copied."""
        dummy_config = {"text_config": {}, "audio_config": {}}

        provider = _build_provider(hf_config=dummy_config)

        assert provider.hf_config is dummy_config

    def test_provide_method_exists(self):
        """The provider exposes a callable provide() entry point."""
        provider = _build_provider()

        # callable(None) is False, so this also covers the attribute's existence.
        assert callable(getattr(provider, "provide", None))

    def test_provide_language_model_method_exists(self):
        """The provider exposes a callable provide_language_model() entry point."""
        provider = _build_provider()

        assert callable(getattr(provider, "provide_language_model", None))

    def test_inherit_from_qwen2_provider(self):
        """Qwen2 configuration fields and defaults are inherited correctly."""
        provider = _build_provider(seq_length=8192, vocab_size=152064, rotary_base=500000.0)

        # Explicit overrides flow through the inherited fields.
        assert provider.seq_length == 8192
        assert provider.vocab_size == 152064
        assert provider.rotary_base == 500000.0

        # Qwen2 architectural defaults come through unchanged.
        assert provider.normalization == "RMSNorm"
        assert provider.gated_linear_unit is True
        assert provider.add_qkv_bias is True
        assert provider.add_bias_linear is False

        # The audio-specific override still applies on top of the Qwen2 base.
        assert provider.scatter_embedding_sequence_parallel is False

    def test_edge_cases(self):
        """Boundary configurations: the smallest and a very large model shape."""
        tiny = _build_provider(num_layers=1, hidden_size=64, num_attention_heads=1)

        assert tiny.num_layers == 1
        assert tiny.hidden_size == 64
        assert tiny.num_attention_heads == 1
        assert tiny.scatter_embedding_sequence_parallel is False

        large = _build_provider(
            num_layers=80,
            hidden_size=8192,
            num_attention_heads=64,
            num_query_groups=8,
        )

        assert large.num_layers == 80
        assert large.hidden_size == 8192
        assert large.num_attention_heads == 64
        assert large.num_query_groups == 8


class TestQwen2AudioModelProviderInheritance:
    """Tests for the class hierarchy of Qwen2AudioModelProvider."""

    def test_inherits_from_gpt_provider(self):
        """Qwen2AudioModelProvider must sit under GPTModelProvider."""
        from megatron.bridge.models.gpt_provider import GPTModelProvider

        assert issubclass(Qwen2AudioModelProvider, GPTModelProvider)

    def test_inherits_from_qwen2_provider(self):
        """Qwen2AudioModelProvider must sit under Qwen2ModelProvider."""
        from megatron.bridge.models.qwen.qwen_provider import Qwen2ModelProvider

        assert issubclass(Qwen2AudioModelProvider, Qwen2ModelProvider)

    def test_provider_method_inheritance(self):
        """Inherited methods and audio-specific fields are both present."""
        provider = _build_provider()

        # Inherited Qwen2ModelProvider API.
        for method_name in ("provide", "provide_language_model"):
            assert hasattr(provider, method_name)

        # Audio-specific fields exist alongside the inherited API.
        for field_name in ("freeze_language_model", "freeze_audio_model", "freeze_audio_projection"):
            assert hasattr(provider, field_name)