diff --git a/examples/conversion/hf_to_megatron_generate_audio_lm.py b/examples/conversion/hf_to_megatron_generate_audio_lm.py new file mode 100644 index 0000000000..c37852dfbe --- /dev/null +++ b/examples/conversion/hf_to_megatron_generate_audio_lm.py @@ -0,0 +1,450 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Audio-Language Model Generation Script for Qwen2-Audio. + +This script demonstrates how to use Qwen2-Audio models with Megatron-Bridge +for audio understanding tasks. + +Example: + # Audio-Language generation with audio from URL: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --audio_url="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" \ --prompt="What's that sound?" + # Audio-Language generation with local audio file: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --audio_path="/path/to/audio.wav" \ --prompt="Describe what you hear in this audio." + # Text-only generation (no audio): + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ --prompt="Hello, how are you?" 
+ + # Load from Megatron checkpoint: + uv run python examples/conversion/hf_to_megatron_generate_audio_lm.py \ + --hf_model_path="Qwen/Qwen2-Audio-7B-Instruct" \ + --megatron_model_path="/path/to/megatron/checkpoint" \ + --audio_url="https://example.com/audio.mp3" \ + --prompt="What's in this audio?" +""" + +import argparse +from io import BytesIO +from typing import Optional +from urllib.request import urlopen + +import torch +import torch.distributed as dist +from megatron.core import parallel_state +from megatron.core.pipeline_parallel.schedules import get_forward_backward_func +from transformers import AutoProcessor, AutoTokenizer + +from megatron.bridge import AutoBridge +from megatron.bridge.models.hf_pretrained.utils import is_safe_repo +from megatron.bridge.utils.common_utils import get_last_rank, print_rank_0 + + +# Try to import librosa for audio loading +try: + import librosa + + HAS_LIBROSA = True +except ImportError: + librosa = None + HAS_LIBROSA = False + + +class SingleBatchIterator: + """Iterator that yields a single batch of data for audio-language generation. + Required by the forward_backward_func function. + + This class creates an iterator that yields exactly one batch containing + input tokens, position IDs, attention mask, and optional audio inputs, + then raises StopIteration. Used for single-step inference in the forward pass. 
+ """ + + def __init__( + self, + input_ids, + position_ids, + attention_mask, + input_features=None, + feature_attention_mask=None, + ): + self.batch = dict( + tokens=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + + # Add audio inputs if provided + if input_features is not None: + self.batch["input_features"] = input_features + if feature_attention_mask is not None: + self.batch["feature_attention_mask"] = feature_attention_mask + + self._yielded = False + + def __iter__(self): + return self + + def __next__(self): + if self._yielded: + raise StopIteration + self._yielded = True + return self.batch + + +def alm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor: + """Forward step function for audio-language generation. + Required by the forward_backward_func function. + + Extracts a batch from the data iterator and runs the model forward pass + with the provided input tokens, position IDs, attention mask, and audio inputs. + + Args: + data_iterator: Iterator providing batches of input data + model: The Megatron model to run forward pass on + **kwargs: Additional keyword arguments (unused) + + Returns: + Tuple of (model_output, loss_function) + """ + batch = next(data_iterator) + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "attention_mask": batch.get("attention_mask", None), + } + + # Add audio inputs if present + if "input_features" in batch: + forward_args["input_features"] = batch["input_features"] + if "feature_attention_mask" in batch: + forward_args["feature_attention_mask"] = batch["feature_attention_mask"] + + def loss_func(x, **kwargs): + return x + + model_output = model(**forward_args) + if isinstance(model_output, tuple): + output_tensor, _ = model_output + else: + output_tensor = model_output + + return output_tensor, loss_func + + +def load_audio(audio_path: str, sampling_rate: int = 16000): + """Load an audio file from URL or file path. 
+ + Args: + audio_path: URL or local file path to the audio file + sampling_rate: Target sampling rate for the audio + + Returns: + Audio data as numpy array + """ + if not HAS_LIBROSA: + raise ImportError("librosa is required for audio loading. Please install it: pip install librosa") + + if audio_path.startswith(("http://", "https://")): + audio_data, _ = librosa.load(BytesIO(urlopen(audio_path).read()), sr=sampling_rate) + else: + audio_data, _ = librosa.load(audio_path, sr=sampling_rate) + + return audio_data + + +def process_audio_inputs(processor, audio_path: Optional[str], prompt: str): + """Process audio inputs for audio-language model. + + Args: + processor: AutoProcessor for the audio-language model + audio_path: Path or URL to the audio file (optional) + prompt: Text prompt + + Returns: + Tuple of (input_ids, input_features, feature_attention_mask, messages) + """ + if audio_path: + # Get sampling rate from processor + sampling_rate = processor.feature_extractor.sampling_rate + + # Load audio + audio_data = load_audio(audio_path, sampling_rate) + + # Create messages with audio and text for Qwen2-Audio format + messages = [ + { + "role": "user", + "content": [ + {"type": "audio", "audio_url": audio_path}, + {"type": "text", "text": prompt}, + ], + } + ] + + # Apply chat template + text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + + # Process inputs with audio + inputs = processor(text=text, audio=[audio_data], return_tensors="pt", padding=True) + + return ( + inputs.input_ids, + inputs.input_features, + getattr(inputs, "feature_attention_mask", None), + messages, + ) + else: + # Text-only processing + messages = [{"role": "user", "content": prompt}] + text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + inputs = processor(text=text, return_tensors="pt") + return inputs.input_ids, None, None, messages + + +def main(args) -> None: + """Main function for audio-language 
generation from HuggingFace models. + + Loads an audio-language model either from HuggingFace (with optional conversion to Megatron) + or directly from a Megatron checkpoint, then performs greedy generation + using the provided prompt and optional audio input. + + Args: + args: Parsed command line arguments containing model paths, prompt, + audio path, parallelism settings, and generation parameters + """ + tp = args.tp + pp = args.pp + ep = args.ep + etp = args.etp + + # Choose loading method based on arguments + if args.megatron_model_path: + # Load from Megatron checkpoint + print_rank_0(f"Loading Megatron model from: {args.megatron_model_path}") + + # We still need HF config for tokenizer, but we'll load the model from Megatron checkpoint + # Create bridge from HF config only (no weights) + bridge = AutoBridge.from_hf_pretrained(args.hf_model_path) + + # Initialize model parallel before loading + model_provider = bridge.to_megatron_provider(load_weights=False) + model_provider.tensor_model_parallel_size = tp + model_provider.pipeline_model_parallel_size = pp + model_provider.expert_model_parallel_size = ep + model_provider.expert_tensor_parallel_size = etp + model_provider.pipeline_dtype = torch.bfloat16 + model_provider.finalize() + model_provider.initialize_model_parallel(seed=0) + + # Load the Megatron model directly + model = bridge.load_megatron_model( + args.megatron_model_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + "pipeline_dtype": torch.bfloat16, + }, + wrap_with_ddp=False, + ) + + else: + # Load from HuggingFace and convert to Megatron + print_rank_0(f"Loading HuggingFace model from: {args.hf_model_path}") + bridge = AutoBridge.from_hf_pretrained(args.hf_model_path) + model_provider = bridge.to_megatron_provider(load_weights=True) + model_provider.tensor_model_parallel_size = tp + model_provider.pipeline_model_parallel_size 
= pp + model_provider.expert_model_parallel_size = ep + model_provider.expert_tensor_parallel_size = etp + model_provider.pipeline_dtype = torch.bfloat16 + model_provider.finalize() + model_provider.initialize_model_parallel(seed=0) + model = model_provider.provide_distributed_model(wrap_with_ddp=False) + + model = [m.cuda() for m in model] + for m in model: + m.eval() + + # Set grad_scale_func to None on the model's config for inference + for m in model: + if hasattr(m, "config"): + m.config.grad_scale_func = None + + # Initialize tokenizer and processor + tokenizer = AutoTokenizer.from_pretrained( + args.hf_model_path, + trust_remote_code=is_safe_repo( + trust_remote_code=args.trust_remote_code, + hf_path=args.hf_model_path, + ), + ) + processor = AutoProcessor.from_pretrained( + args.hf_model_path, + trust_remote_code=is_safe_repo( + trust_remote_code=args.trust_remote_code, + hf_path=args.hf_model_path, + ), + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Determine audio path (URL or file) + audio_path = args.audio_url or args.audio_path + + # Process inputs (text and audio if provided) + prompt = args.prompt + input_ids, input_features, feature_attention_mask, messages = process_audio_inputs(processor, audio_path, prompt) + + # Move to GPU + input_ids = input_ids.cuda() + if input_features is not None: + input_features = input_features.cuda() + if feature_attention_mask is not None: + feature_attention_mask = feature_attention_mask.cuda() + + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids) + ) + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + generated_ids = input_ids.clone() + + stop_tokens = [tokenizer.eos_token_id] + + # Greedy generation loop + for step in range(args.max_new_tokens): + with torch.no_grad(): + print_rank_0(f"Generation step {step}") + + fwd_bwd_function = get_forward_backward_func() + + # Keep passing audio 
inputs for all steps to ensure audio features are available + iterator = SingleBatchIterator( + input_ids, position_ids, attention_mask, input_features, feature_attention_mask + ) + + output = fwd_bwd_function( + forward_step_func=alm_forward_step, + data_iterator=iterator, + model=model, + num_microbatches=1, + forward_only=True, + seq_length=input_ids.size(1), + micro_batch_size=1, + collect_non_loss_data=True, + ) + if isinstance(output, list) and len(output) > 0: + output = output[0] + + if parallel_state.is_pipeline_last_stage(): + world_size = parallel_state.get_tensor_model_parallel_world_size() + gathered_tensors = [torch.zeros_like(output) for _ in range(world_size)] + # All-gather operation + dist.all_gather(gathered_tensors, output, group=parallel_state.get_tensor_model_parallel_group()) + # Concatenate along last dimension (dim=2) + output = torch.cat(gathered_tensors, dim=2) + next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) + + # Debug: print token information + if step < 5: # Only for first few iterations + print_rank_0(f"Step {step}: output shape={output.shape}, var={output.var():.4f}") + logits = output[0, -1, :] + top5_vals, top5_ids = torch.topk(logits, 5) + top5_tokens = [tokenizer.decode([idx]) for idx in top5_ids] + print_rank_0(f"Top 5: {list(zip(top5_tokens, top5_vals.tolist()))}") + print_rank_0( + f"Selected: '{tokenizer.decode([next_token_ids.item()])}' (id={next_token_ids.item()})" + ) + else: + next_token_ids = torch.ones((1, 1), device=generated_ids.device, dtype=generated_ids.dtype) + + torch.distributed.broadcast(next_token_ids, get_last_rank()) + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + + input_ids = generated_ids + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device) + .unsqueeze(0) + .expand_as(input_ids) + ) + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + + # If the generated token is the end of sequence token, stop generating 
+ if next_token_ids.item() in stop_tokens: + break + + # Decode the generated sequence + generated_text = tokenizer.decode(list(generated_ids[0]), skip_special_tokens=True) + print_rank_0("======== GENERATED TEXT OUTPUT ========") + if audio_path: + print_rank_0(f"Audio: {audio_path}") + print_rank_0(f"Prompt: {prompt}") + print_rank_0(f"Generated: {generated_text}") + print_rank_0("=======================================") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Audio-Language Generation from HuggingFace Audio-Language Models") + parser.add_argument( + "--hf_model_path", + type=str, + required=True, + help="Path to the HuggingFace audio-language model.", + ) + parser.add_argument( + "--prompt", + type=str, + default="What's that sound?", + help="Input prompt for audio-language generation.", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=50, + help="Maximum number of new tokens to generate.", + ) + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism size") + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism size") + parser.add_argument("--ep", type=int, default=1, help="Expert parallelism size") + parser.add_argument("--etp", type=int, default=1, help="Expert tensor parallelism size") + parser.add_argument("--megatron_model_path", type=str, default=None, help="Path to the Megatron model checkpoint") + parser.add_argument( + "--audio_path", + type=str, + default=None, + help="Local path to the audio file for audio-language generation (optional).", + ) + parser.add_argument( + "--audio_url", + type=str, + default=None, + help="URL to the audio file for audio-language generation (optional).", + ) + parser.add_argument("--trust_remote_code", action="store_true", help="if trust_remote_code") + args = parser.parse_args() + + main(args) + + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() diff --git 
a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index 2400b55f48..6f00e23e26 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -121,6 +121,11 @@ Qwen25ModelProvider72B, Qwen25ModelProvider500M, ) +from megatron.bridge.models.qwen_audio import ( + Qwen2AudioBridge, + Qwen2AudioModel, + Qwen2AudioModelProvider, +) from megatron.bridge.models.qwen_omni import ( Qwen25OmniBridge, Qwen25OmniModel, @@ -224,6 +229,10 @@ "MimoBridge", # Nemotron Models "NemotronBridge", + # Audio-Language Models + "Qwen2AudioBridge", + "Qwen2AudioModel", + "Qwen2AudioModelProvider", # VL Models "Qwen25VLModel", "Qwen25VLBridge", diff --git a/src/megatron/bridge/models/qwen_audio/__init__.py b/src/megatron/bridge/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..c7885b5594 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model Bridge and Provider implementations. + +This module provides support for Qwen2-Audio audio-language models. 
+ +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + +Supported models: +- Qwen2-Audio-7B +- Qwen2-Audio-7B-Instruct + +Example usage: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + >>> provider = bridge.to_megatron_provider() +""" + +from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel +from megatron.bridge.models.qwen_audio.qwen2_audio_bridge import Qwen2AudioBridge +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import ( + Qwen2AudioModelProvider, +) + + +__all__ = [ + # Bridge + "Qwen2AudioBridge", + # Model + "Qwen2AudioModel", + # Model Providers + "Qwen2AudioModelProvider", +] diff --git a/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py b/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py new file mode 100644 index 0000000000..c87c70f78d --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/modeling_qwen2_audio.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model for Megatron. 
+ +This module provides the Qwen2AudioModel class that combines: +- HuggingFace's audio encoder (audio_tower) for processing mel spectrograms +- HuggingFace's multimodal projector for audio-to-language projection +- Megatron's language model for text generation + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct +""" + +import types +from typing import TYPE_CHECKING, Optional + +import torch +from megatron.core.transformer.module import MegatronModule +from torch import Tensor + +from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.utils.common_utils import hook_hf_module_setattr_for_tp_grad_sync + + +if TYPE_CHECKING: + from megatron.core.packed_seq_params import PackedSeqParams + + +# Import HuggingFace Qwen2Audio model classes with fallback +try: + from transformers import Qwen2AudioForConditionalGeneration + from transformers.models.qwen2_audio.modeling_qwen2_audio import ( + Qwen2AudioEncoder, + Qwen2AudioMultiModalProjector, + ) + + HAS_QWEN2_AUDIO = True +except ImportError: + Qwen2AudioForConditionalGeneration = None + Qwen2AudioEncoder = None + Qwen2AudioMultiModalProjector = None + HAS_QWEN2_AUDIO = False + + +class Qwen2AudioModel(MegatronModule): + """ + Qwen2-Audio Model wrapper for Megatron. + + This class combines HuggingFace's audio components with Megatron's language model: + - Audio tower (HF): Processes mel spectrograms through Whisper-like encoder + - Multimodal projector (HF): Projects audio features to language model space + - Language model (Megatron): Generates text conditioned on audio and text inputs + + The audio encoder forward pass uses HuggingFace implementation, + while the language model forward pass uses Megatron's optimized implementation. + + Args: + config (GPTModelProvider): Model provider containing configuration for language and audio modules. + pre_process (bool, optional): Whether to construct the audio tower and projector. Default: True. 
+ post_process (bool, optional): Whether to apply post-processing. Default: True. + vp_stage (Optional[int], optional): Pipeline stage for model parallelism. Default: None. + + Attributes: + pre_process (bool): If True, enables audio and multimodal components. + post_process (bool): If True, enables post-processing. + vp_stage (Optional[int]): Pipeline stage for model parallelism. + audio_tower (nn.Module): Audio encoder from HuggingFace (Whisper-like). + multi_modal_projector (nn.Module): Projects audio features to language model space. + language_model (nn.Module): Megatron language model. + + Forward Inputs: + input_ids (torch.LongTensor, optional): Tokenized input ids for the language model. + attention_mask (torch.Tensor, optional): Attention mask for the language model. + position_ids (torch.LongTensor, optional): Position ids for the language model. + inputs_embeds (torch.FloatTensor, optional): Precomputed input embeddings. + input_features (torch.Tensor, optional): Mel spectrogram features for audio. + feature_attention_mask (torch.Tensor, optional): Attention mask for audio features. + labels (torch.Tensor, optional): Target labels for supervised training. + runtime_gather_output (bool, optional): If True, gather outputs across pipeline stages. + loss_mask (Tensor, optional): Mask for loss computation. + + Returns: + Tensor: Model output (e.g., logits or loss, depending on mode). + + Note: + - If `pre_process` is False, only the language model is constructed. + - The audio tower and projector are only active if `pre_process` is True. + - This class is intended for use within the Megatron-LM framework. 
+ """ + + def __init__( + self, + config: GPTModelProvider, + pre_process: bool = True, + post_process: bool = True, + vp_stage: Optional[int] = None, + ) -> None: + super().__init__(config=config) + + self.pre_process = pre_process + self.post_process = post_process + self.vp_stage = vp_stage + + if pre_process: + if not HAS_QWEN2_AUDIO: + raise ImportError( + "Qwen2Audio model requires transformers with Qwen2Audio support. " + "Please upgrade: pip install 'transformers>=4.40.0'" + ) + + # Initialize audio tower from HuggingFace config + # The audio_tower is a Whisper-like encoder that processes mel spectrograms + self.audio_tower = Qwen2AudioEncoder(config.hf_config.audio_config) + + # Initialize multimodal projector from HuggingFace config + # Projects audio encoder output dimension to language model hidden size + self.multi_modal_projector = Qwen2AudioMultiModalProjector(config.hf_config) + + # Ensure HF audio tower params are marked for TP grad sync + hook_hf_module_setattr_for_tp_grad_sync(self.audio_tower) + hook_hf_module_setattr_for_tp_grad_sync(self.multi_modal_projector) + + # Initialize Megatron language model + self.language_model = self.config.provide_language_model( + pre_process=pre_process, post_process=post_process, vp_stage=vp_stage + ) + + # Finalize grad requires these to be bound with module + self.share_embeddings_and_output_weights = config.share_embeddings_and_output_weights + self.shared_embedding_or_output_weight = self.language_model.shared_embedding_or_output_weight + + # Monkey-patch methods from HuggingFace Qwen2AudioForConditionalGeneration + if HAS_QWEN2_AUDIO and Qwen2AudioForConditionalGeneration is not None: + self._merge_input_ids_with_audio_features = types.MethodType( + Qwen2AudioForConditionalGeneration._merge_input_ids_with_audio_features, self + ) + + # Store audio token id from config + self.audio_token_id = getattr(config, "audio_token_id", 151646) + self.pad_token_id = getattr(config.hf_config, "pad_token_id", -1) + + 
def set_input_tensor(self, input_tensor) -> None: + """Set model chunk input tensor.""" + self.language_model.set_input_tensor(input_tensor) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.Tensor] = None, + feature_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + packed_seq_params: Optional["PackedSeqParams"] = None, + *, + loss_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Forward pass combining HuggingFace audio encoder with Megatron language model. + + Args: + input_ids: Tokenized input ids for the language model. + attention_mask: Attention mask for the language model. + position_ids: Position ids for the language model. + inputs_embeds: Precomputed input embeddings. + input_features: Mel spectrogram features for audio input. + feature_attention_mask: Attention mask for audio features. + labels: Target labels for supervised training. + runtime_gather_output: If True, gather outputs across pipeline stages. + loss_mask: Mask for loss computation. + + Returns: + Tensor: Model output containing logits or loss. 
+ """ + if self.pre_process: + if inputs_embeds is None: + # Get text embeddings from Megatron language model + inputs_embeds = self.language_model.embedding( + input_ids=input_ids, position_ids=None + ) # [seq_len, batch, hidden] + + # Transpose to HF format [batch, seq_len, hidden] + inputs_embeds = inputs_embeds.transpose(1, 0).contiguous() + + if input_features is not None and input_ids.shape[1] != 1: + # Process audio features + target_device = self.audio_tower.conv1.weight.device + + input_features = input_features.to(target_device) + if feature_attention_mask is not None: + feature_attention_mask = feature_attention_mask.to(target_device) + + # Compute audio feature lengths from attention mask + audio_feat_lengths, audio_output_lengths = self.audio_tower._get_feat_extract_output_lengths( + feature_attention_mask.sum(-1) + ) + + batch_size, _, max_mel_seq_len = input_features.shape + max_seq_len = (max_mel_seq_len - 2) // 2 + 1 + + # Create attention mask for audio encoder + seq_range = ( + torch.arange(0, max_seq_len, dtype=audio_feat_lengths.dtype, device=audio_feat_lengths.device) + .unsqueeze(0) + .expand(batch_size, max_seq_len) + ) + lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len) + padding_mask = seq_range >= lengths_expand + + audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand( + batch_size, 1, max_seq_len, max_seq_len + ) + audio_attention_mask = audio_attention_mask_.to( + dtype=self.audio_tower.conv1.weight.dtype, device=target_device + ) + audio_attention_mask[audio_attention_mask_] = float("-inf") + + # Forward through audio encoder + audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask) + selected_audio_feature = audio_outputs.last_hidden_state + + # Project audio features to language model dimension + audio_features = self.multi_modal_projector(selected_audio_feature) + + # Check if we need legacy processing (non-expanded audio tokens) + audio_tokens = 
input_ids == self.audio_token_id + legacy_processing = (audio_tokens[:, :-1] & audio_tokens[:, 1:]).sum() == 0 + + if legacy_processing: + # Use HF's merge function for legacy processing + inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features( + audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels + ) + else: + # Modern processing: audio tokens are already expanded + num_audios, max_audio_tokens, embed_dim = audio_features.shape + audio_features_mask = torch.arange(max_audio_tokens, device=audio_output_lengths.device)[None, :] + audio_features_mask = audio_features_mask < audio_output_lengths[:, None] + audio_features = audio_features[audio_features_mask] + + n_audio_tokens = (input_ids == self.audio_token_id).sum().item() + n_audio_features = audio_features.shape[0] + + if n_audio_tokens != n_audio_features: + raise ValueError( + f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}" + ) + + special_audio_mask = (input_ids == self.audio_token_id).to(inputs_embeds.device) + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) + + # Transpose back to Megatron format [seq_len, batch, hidden] + inputs_embeds = inputs_embeds.transpose(1, 0).contiguous() + + # Forward through Megatron language model + outputs = self.language_model.forward( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + decoder_input=inputs_embeds, + labels=labels, + loss_mask=loss_mask, + runtime_gather_output=runtime_gather_output, + packed_seq_params=packed_seq_params, + ) + + return outputs + + def freeze( + self, + freeze_language_model: bool, + freeze_audio_model: bool, + freeze_audio_projection: bool, + ): + """Freeze model modules. 
+ + Make specific modules non-trainable by setting requires_grad to False. + + Args: + freeze_language_model (bool): Freeze the language model module. + freeze_audio_model (bool): Freeze the audio model module (audio_tower). + freeze_audio_projection (bool): Freeze the audio projection module (multi_modal_projector). + """ + modules = [] + + if freeze_language_model and hasattr(self, "language_model") and self.language_model is not None: + modules.append(self.language_model) + + if freeze_audio_model and hasattr(self, "audio_tower") and self.audio_tower is not None: + modules.append(self.audio_tower) + + if ( + freeze_audio_projection + and hasattr(self, "multi_modal_projector") + and self.multi_modal_projector is not None + ): + modules.append(self.multi_modal_projector) + + for module in modules: + for param in module.parameters(): + param.requires_grad = False diff --git a/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py b/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py new file mode 100644 index 0000000000..ca8b57ded0 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/qwen2_audio_bridge.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Megatron Bridge for Qwen2-Audio Models. + +This module provides the bridge implementation for converting between HuggingFace +Qwen2-Audio models and Megatron-Core format. 
+ +Supported models: +- Qwen2-Audio-7B +- Qwen2-Audio-7B-Instruct + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct +""" + +from transformers import Qwen2AudioForConditionalGeneration + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.param_mapping import ( + AutoMapping, + GatedMLPMapping, + QKVMapping, + ReplicatedMapping, +) +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM +from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import Qwen2AudioModelProvider + + +@MegatronModelBridge.register_bridge( + source=Qwen2AudioForConditionalGeneration, + target=Qwen2AudioModel, + provider=Qwen2AudioModelProvider, + model_type="qwen2_audio", +) +class Qwen2AudioBridge(MegatronModelBridge): + """ + Megatron Bridge for Qwen2-Audio Models. + + This bridge handles conversion between HuggingFace Qwen2AudioForConditionalGeneration + and Megatron-Core Qwen2AudioModel format for audio-language models. + + The weight mappings handle: + - Audio encoder weights (audio_tower) + - Language model weights + - Multimodal projector weights + + Example: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + >>> provider = bridge.to_megatron_provider() + """ + + def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> Qwen2AudioModelProvider: + """ + Create a Qwen2AudioModelProvider from a HuggingFace pretrained model. 
+ + Args: + hf_pretrained: HuggingFace pretrained model + + Returns: + Qwen2AudioModelProvider configured with the HF model's parameters + """ + hf_config = hf_pretrained.config + + # Qwen2-Audio has separate text_config and audio_config + text_config = getattr(hf_config, "text_config", hf_config) + + # Use base class helper for common config conversion + provider_kwargs = self.hf_config_to_provider_kwargs(text_config) + provider = Qwen2AudioModelProvider(**provider_kwargs) + + # Qwen2-specific settings + provider.normalization = "RMSNorm" + provider.gated_linear_unit = True + provider.add_qkv_bias = True + provider.add_bias_linear = False + provider.hidden_dropout = 0.0 + + # Audio-specific settings + provider.hf_config = hf_config + provider.audio_token_id = getattr(hf_config, "audio_token_index", 151646) + provider.bos_token_id = getattr(hf_config, "bos_token_id", 151643) + provider.eos_token_id = getattr(hf_config, "eos_token_id", 151645) + provider.pad_token_id = getattr(hf_config, "pad_token_id", 151643) + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + """ + Return MegatronMappingRegistry containing parameter mappings for audio-language models. 
+ + HuggingFace weight structure: + - language_model.model.embed_tokens.weight + - language_model.model.layers.{i}.input_layernorm.weight + - language_model.model.layers.{i}.self_attn.{q,k,v,o}_proj.weight + - language_model.model.layers.{i}.post_attention_layernorm.weight + - language_model.model.layers.{i}.mlp.{gate,up,down}_proj.weight + - language_model.model.norm.weight + - language_model.lm_head.weight + - audio_tower.** (conv1, conv2, embed_positions, layers, layer_norm, avg_pooler) + - multi_modal_projector.linear.weight + + Returns: + MegatronMappingRegistry with all parameter mappings + """ + # Language model direct mappings + # Maps: Megatron param name -> HuggingFace param name + param_mappings = { + # Embeddings and output layers + "language_model.embedding.word_embeddings.weight": "language_model.model.embed_tokens.weight", + "language_model.output_layer.weight": "language_model.lm_head.weight", + "language_model.decoder.final_layernorm.weight": "language_model.model.norm.weight", + # Layer normalization for attention and MLP + "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "language_model.model.layers.*.input_layernorm.weight", + "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "language_model.model.layers.*.post_attention_layernorm.weight", + # Attention output projection + "language_model.decoder.layers.*.self_attention.linear_proj.weight": "language_model.model.layers.*.self_attn.o_proj.weight", + # MLP output projection + "language_model.decoder.layers.*.mlp.linear_fc2.weight": "language_model.model.layers.*.mlp.down_proj.weight", + } + + mapping_list = [] + # Convert each dictionary entry to AutoMapping(megatron_param, hf_param) + for megatron_param, hf_param in param_mappings.items(): + mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param)) + + # Add special mappings that require parameter transformation + mapping_list.extend( + [ + # Audio tower weights are 
replicated directly + # Includes: conv1, conv2, embed_positions, layers.*.self_attn.*, layers.*.fc1, layers.*.fc2, layer_norm, avg_pooler + ReplicatedMapping( + megatron_param="audio_tower.**", + hf_param="audio_tower.**", + ), + # Multimodal projector weights (linear layer) + ReplicatedMapping( + megatron_param="multi_modal_projector.**", + hf_param="multi_modal_projector.**", + ), + # QKV: Combine separate Q, K, V matrices into single QKV matrix + QKVMapping( + megatron_param="language_model.decoder.layers.*.self_attention.linear_qkv.weight", + q="language_model.model.layers.*.self_attn.q_proj.weight", + k="language_model.model.layers.*.self_attn.k_proj.weight", + v="language_model.model.layers.*.self_attn.v_proj.weight", + ), + # QKV bias: Combine separate Q, K, V biases into single QKV bias (Qwen2 specific) + QKVMapping( + megatron_param="language_model.decoder.layers.*.self_attention.linear_qkv.bias", + q="language_model.model.layers.*.self_attn.q_proj.bias", + k="language_model.model.layers.*.self_attn.k_proj.bias", + v="language_model.model.layers.*.self_attn.v_proj.bias", + ), + # Gated MLP: Combine gate and up projection matrices into single FC1 matrix + GatedMLPMapping( + megatron_param="language_model.decoder.layers.*.mlp.linear_fc1.weight", + gate="language_model.model.layers.*.mlp.gate_proj.weight", + up="language_model.model.layers.*.mlp.up_proj.weight", + ), + ] + ) + + return MegatronMappingRegistry(*mapping_list) diff --git a/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py b/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py new file mode 100644 index 0000000000..b4eb069444 --- /dev/null +++ b/src/megatron/bridge/models/qwen_audio/qwen2_audio_provider.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2-Audio Model Provider configurations for Megatron-Core. + +This module provides configuration classes for Qwen2-Audio models, +compatible with HuggingFace's Qwen2-Audio model configurations. + +Reference: https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + +Qwen2-Audio Key Features: +- Audio-language capabilities with separate language model and audio encoder +- Whisper-like audio encoder for processing mel spectrograms +- Based on Qwen2 language model architecture +""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +from megatron.core.models.gpt import GPTModel as MCoreGPTModel + +from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.models.qwen.qwen_provider import Qwen2ModelProvider + + +if TYPE_CHECKING: + from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel + + +# ============================================================================= +# Qwen2-Audio Model Provider +# ============================================================================= + + +@dataclass +class Qwen2AudioModelProvider(Qwen2ModelProvider): + """ + Base model provider for Qwen2-Audio Models. + + Qwen2-Audio is a multimodal model combining a Whisper-like audio encoder + with a Qwen2 language model for audio understanding tasks. 
+ + Reference: + - https://huggingface.co/Qwen/Qwen2-Audio-7B + - https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct + + Key Features: + - Audio encoder based on Whisper architecture + - Supports variable-length audio inputs via mel spectrograms + - Multi-turn conversation with audio context + """ + + # Audio-Language models shouldn't scatter embeddings across sequence parallel regions + # because audio embeddings are inserted into language embeddings + scatter_embedding_sequence_parallel: bool = False + + # HuggingFace config containing audio_config and text_config + hf_config: Optional[Any] = None + + # Audio-specific token IDs (defaults from Qwen2-Audio) + audio_token_id: int = 151646 # <|AUDIO|> token + + # Token IDs + bos_token_id: int = 151643 + eos_token_id: int = 151645 + pad_token_id: int = 151643 + + # Freeze options for fine-tuning + freeze_language_model: bool = False + freeze_audio_model: bool = False + freeze_audio_projection: bool = False + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> "Qwen2AudioModel": + """ + Provide a Qwen2AudioModel instance with audio and language components. 
+
+        Args:
+            pre_process: Whether this is the first stage in pipeline parallelism
+            post_process: Whether this is the last stage in pipeline parallelism
+            vp_stage: Virtual pipeline stage number
+
+        Returns:
+            Qwen2AudioModel instance with HF audio encoder and Megatron language model
+        """
+        # Local import — presumably to avoid a circular import between this
+        # provider module and modeling_qwen2_audio at load time; confirm.
+        from megatron.bridge.models.qwen_audio.modeling_qwen2_audio import Qwen2AudioModel
+
+        # The provider itself acts as the model config.
+        model = Qwen2AudioModel(
+            config=self,
+            pre_process=pre_process,
+            post_process=post_process,
+            vp_stage=vp_stage,
+        )
+
+        # Apply freeze options if any are enabled for fine-tuning
+        if self.freeze_language_model or self.freeze_audio_model or self.freeze_audio_projection:
+            model.freeze(
+                freeze_language_model=self.freeze_language_model,
+                freeze_audio_model=self.freeze_audio_model,
+                freeze_audio_projection=self.freeze_audio_projection,
+            )
+
+        return model
+
+    def provide_language_model(self, pre_process=None, post_process=None, vp_stage=None) -> MCoreGPTModel:
+        """
+        Provide just the language model component without audio.
+
+        Args:
+            pre_process: Whether this is the first stage in pipeline parallelism
+            post_process: Whether this is the last stage in pipeline parallelism
+            vp_stage: Virtual pipeline stage number
+
+        Returns:
+            MCoreGPTModel instance (language model only)
+        """
+        # Call GPTModelProvider.provide explicitly (unbound) to bypass this
+        # class's own provide() override, which would wrap the result in a
+        # Qwen2AudioModel; this yields the bare Megatron GPT language model.
+        return GPTModelProvider.provide(self, pre_process=pre_process, post_process=post_process, vp_stage=vp_stage)
diff --git a/tests/functional_tests/L2_Launch_models_qwen_audio.sh b/tests/functional_tests/L2_Launch_models_qwen_audio.sh
new file mode 100755
index 0000000000..73226fec36
--- /dev/null
+++ b/tests/functional_tests/L2_Launch_models_qwen_audio.sh
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +uv run coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest \ + -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \ + tests/functional_tests/models/qwen_audio +coverage combine -q diff --git a/tests/functional_tests/models/qwen_audio/__init__.py b/tests/functional_tests/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..341a77c5bc --- /dev/null +++ b/tests/functional_tests/models/qwen_audio/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py b/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py new file mode 100644 index 0000000000..5ac918cd5d --- /dev/null +++ b/tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functional tests for Qwen2 Audio HF to Megatron generation. + +Example run commands: + # Run all generation tests + pytest tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py + + # Run specific test + pytest tests/functional_tests/models/qwen_audio/test_qwen2_audio_generation.py::TestQwen2AudioGeneration::test_qwen2_audio_generation + +Note: These tests use small proxy/toy models for fast generation testing. 
+""" + +import json +import subprocess +import sys +from pathlib import Path + +import pytest +import torch +from transformers import AutoTokenizer, Qwen2AudioConfig, Qwen2AudioForConditionalGeneration + + +HF_QWEN2_AUDIO_TOY_MODEL_CONFIG = { + "architectures": ["Qwen2AudioForConditionalGeneration"], + "audio_token_index": 151646, + "model_type": "qwen2_audio", + "audio_config": { + "model_type": "qwen2_audio_encoder", + "num_mel_bins": 128, + "d_model": 256, + "encoder_layers": 4, + "encoder_attention_heads": 4, + "encoder_ffn_dim": 512, + "dropout": 0.0, + "attention_dropout": 0.0, + "activation_function": "gelu", + "activation_dropout": 0.0, + "encoder_layerdrop": 0.0, + "num_hidden_layers": 4, + "initializer_range": 0.02, + "scale_embedding": False, + "max_source_positions": 1500, + }, + "text_config": { + "vocab_size": 151936, + "max_position_embeddings": 32768, + "hidden_size": 256, + "intermediate_size": 512, + "num_hidden_layers": 4, + "num_attention_heads": 4, + "num_key_value_heads": 4, + "hidden_act": "silu", + "initializer_range": 0.02, + "rms_norm_eps": 1e-06, + "use_cache": True, + "rope_theta": 10000.0, + "attention_dropout": 0.0, + "tie_word_embeddings": False, + }, +} + + +class TestQwen2AudioGeneration: + """ + Test Qwen2 Audio model generation using HF to Megatron conversion with audio inputs. + Uses small proxy/toy models for fast generation testing. + """ + + @pytest.fixture(scope="class") + def qwen2_audio_toy_model_path(self, tmp_path_factory): + """ + Create and save a HuggingFace Qwen2 Audio toy model to a temporary directory. 
+ + Args: + tmp_path_factory: Pytest temporary path factory for class-scoped fixtures + + Returns: + str: Path to the saved HuggingFace model directory + """ + # Create a temporary directory for this test class + temp_dir = tmp_path_factory.mktemp("qwen2_audio_generation_toy_model") + model_dir = temp_dir / "qwen2_audio_toy" + + # Create Qwen2 Audio config from the toy model config + config = Qwen2AudioConfig(**HF_QWEN2_AUDIO_TOY_MODEL_CONFIG) + config.torch_dtype = torch.bfloat16 + + # Create model with random weights and convert to bfloat16 + model = Qwen2AudioForConditionalGeneration(config) + model = model.to(dtype=torch.bfloat16) + + # Download and save tokenizer and processor from a reference Qwen2 Audio model + try: + from transformers import AutoProcessor + + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + tokenizer.save_pretrained(model_dir) + + # Also save the processor + processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + processor.save_pretrained(model_dir) + except Exception as e: + print(f"Warning: Could not download tokenizer/processor, creating minimal files: {e}") + # Create minimal tokenizer files if download fails + tokenizer_config = { + "tokenizer_class": "Qwen2Tokenizer", + "vocab_size": 151936, + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + } + with open(model_dir / "tokenizer_config.json", "w") as f: + json.dump(tokenizer_config, f, indent=2) + + # Save model and config to directory + model.save_pretrained(model_dir, safe_serialization=True) + + # Also save config.json explicitly + config_path = model_dir / "config.json" + with open(config_path, "w") as f: + json.dump(HF_QWEN2_AUDIO_TOY_MODEL_CONFIG, f, indent=2) + + print(f"Created toy model at: {model_dir}") + return str(model_dir) + + @pytest.mark.run_only_on("GPU") + def test_qwen2_audio_generation(self, qwen2_audio_toy_model_path): + """ + Test Qwen2 
Audio toy model with audio generation. + Uses a small proxy model instead of the full 7B model for fast testing. + Uses real audio to test audio-language pipeline. + + Args: + qwen2_audio_toy_model_path: Path to the toy Qwen2 Audio model (from fixture) + """ + cmd = [ + sys.executable, + "-m", + "torch.distributed.run", + "--nproc_per_node=2", + "examples/conversion/hf_to_megatron_generate_audio_lm.py", + f"--hf_model_path={qwen2_audio_toy_model_path}", + "--audio_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + "--prompt=What's that sound?", + "--tp=2", + "--max_new_tokens=50", + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent.parent.parent, + ) + + # Print output for debugging + print("\n" + "=" * 80) + print("STDOUT:") + print(result.stdout) + print("\n" + "=" * 80) + print("STDERR:") + print(result.stderr) + print("=" * 80 + "\n") + + if result.returncode != 0: + assert False, f"Qwen2-Audio toy model generation failed with return code {result.returncode}" + + print("SUCCESS: Qwen2-Audio toy model generation test completed successfully") + + except subprocess.TimeoutExpired: + assert False, "Qwen2-Audio toy model generation test timed out after 5 minutes" + except Exception as e: + print(f"Error during Qwen2-Audio toy model generation test: {e}") + raise diff --git a/tests/unit_tests/models/qwen_audio/__init__.py b/tests/unit_tests/models/qwen_audio/__init__.py new file mode 100644 index 0000000000..2f4cc5bc8c --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Qwen Audio models.""" diff --git a/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py new file mode 100644 index 0000000000..fc255479d2 --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_bridge.py @@ -0,0 +1,358 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import pytest +import torch + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM +from megatron.bridge.models.qwen_audio.qwen2_audio_bridge import Qwen2AudioBridge +from megatron.bridge.models.qwen_audio.qwen2_audio_provider import Qwen2AudioModelProvider + + +@pytest.fixture +def mock_text_config(): + """Create a mock text config for Qwen2-Audio.""" + text_config = SimpleNamespace( + num_hidden_layers=32, + hidden_size=4096, + intermediate_size=11008, + num_attention_heads=32, + num_key_value_heads=32, + initializer_range=0.02, + rms_norm_eps=1e-6, + vocab_size=151936, + max_position_embeddings=4096, + rope_theta=1000000.0, + tie_word_embeddings=False, + hidden_act="silu", + rope_scaling=None, + torch_dtype=torch.bfloat16, + bos_token_id=151643, + eos_token_id=151645, + ) + return text_config + + +@pytest.fixture +def mock_audio_config(): + """Create a mock audio encoder config for Qwen2-Audio.""" + audio_config = Mock() + audio_config.d_model = 1280 + audio_config.encoder_layers = 32 + audio_config.encoder_attention_heads = 20 + audio_config.encoder_ffn_dim = 5120 + return audio_config + + +@pytest.fixture +def mock_hf_config(mock_text_config, mock_audio_config): + """Create a mock HF config for Qwen2-Audio.""" + config = Mock() + config.text_config = mock_text_config + config.audio_config = mock_audio_config + config.tie_word_embeddings = False + config.audio_token_index = 151646 + config.bos_token_id = 151643 + config.eos_token_id = 151645 + config.pad_token_id = 151643 + return config + + +@pytest.fixture +def mock_hf_pretrained(mock_hf_config): + """Create a mock HF pretrained VLM.""" + pretrained = Mock(spec=PreTrainedVLM) + pretrained.config = mock_hf_config + return pretrained + + +@pytest.fixture +def qwen2_audio_bridge(): + """Create a Qwen2AudioBridge instance.""" + return 
Qwen2AudioBridge() + + +class TestQwen2AudioBridgeInitialization: + """Test Qwen2AudioBridge initialization and basic functionality.""" + + def test_bridge_initialization(self, qwen2_audio_bridge): + """Test that bridge can be initialized.""" + assert isinstance(qwen2_audio_bridge, Qwen2AudioBridge) + + def test_bridge_has_required_methods(self, qwen2_audio_bridge): + """Test that bridge has required methods.""" + assert hasattr(qwen2_audio_bridge, "provider_bridge") + assert callable(qwen2_audio_bridge.provider_bridge) + + assert hasattr(qwen2_audio_bridge, "mapping_registry") + assert callable(qwen2_audio_bridge.mapping_registry) + + +class TestQwen2AudioBridgeProviderBridge: + """Test provider_bridge method functionality.""" + + def test_provider_bridge_basic_config(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge creates correct provider with basic config.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert isinstance(provider, Qwen2AudioModelProvider) + + # Check basic transformer config + assert provider.num_layers == 32 + assert provider.hidden_size == 4096 + assert provider.ffn_hidden_size == 11008 + assert provider.num_attention_heads == 32 + assert provider.num_query_groups == 32 + assert provider.init_method_std == 0.02 + assert provider.layernorm_epsilon == 1e-6 + assert provider.vocab_size == 151936 + assert provider.seq_length == 4096 + assert provider.rotary_base == 1000000.0 + assert provider.share_embeddings_and_output_weights is False + + def test_provider_bridge_audio_specific_config(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge creates correct audio-specific configuration.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + # Check audio-specific token IDs + assert provider.audio_token_id == 151646 + assert provider.bos_token_id == 151643 + assert provider.eos_token_id == 151645 + assert provider.pad_token_id == 151643 + + # Check hf_config is 
propagated + assert provider.hf_config is mock_hf_pretrained.config + + # Check Qwen2-specific settings + assert provider.add_qkv_bias is True + + def test_provider_bridge_qwen2_settings(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge sets Qwen2-specific settings correctly.""" + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.normalization == "RMSNorm" + assert provider.gated_linear_unit is True + assert provider.add_qkv_bias is True + assert provider.add_bias_linear is False + assert provider.hidden_dropout == 0.0 + + def test_provider_bridge_with_custom_token_ids(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with custom token IDs from config.""" + mock_hf_pretrained.config.audio_token_index = 200000 + mock_hf_pretrained.config.bos_token_id = 200001 + mock_hf_pretrained.config.eos_token_id = 200002 + mock_hf_pretrained.config.pad_token_id = 200003 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.audio_token_id == 200000 + assert provider.bos_token_id == 200001 + assert provider.eos_token_id == 200002 + assert provider.pad_token_id == 200003 + + def test_provider_bridge_with_tied_embeddings(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with tied embeddings.""" + mock_hf_pretrained.config.text_config.tie_word_embeddings = True + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.share_embeddings_and_output_weights is True + + @patch.object(Qwen2AudioBridge, "dtype_from_hf") + def test_provider_bridge_dtype_handling_fp16(self, mock_dtype_from_hf, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles fp16 dtype correctly.""" + mock_dtype_from_hf.return_value = torch.float16 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.fp16 is True + assert provider.bf16 is False + assert provider.params_dtype == torch.float16 + 
+ @patch.object(Qwen2AudioBridge, "dtype_from_hf") + def test_provider_bridge_dtype_handling_bf16(self, mock_dtype_from_hf, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles bfloat16 dtype correctly.""" + mock_dtype_from_hf.return_value = torch.bfloat16 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.fp16 is False + assert provider.bf16 is True + assert provider.params_dtype == torch.bfloat16 + + @patch.object(Qwen2AudioBridge, "make_vocab_size_divisible_by") + def test_provider_bridge_vocab_size_divisibility(self, mock_divisible, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge handles vocab size divisibility.""" + mock_divisible.return_value = 128 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + mock_divisible.assert_called_once_with(151936) + assert provider.make_vocab_size_divisible_by == 128 + + +class TestQwen2AudioBridgeMappingRegistry: + """Test mapping_registry method functionality.""" + + def _get_mapping_names(self, registry): + """Helper to extract all mapping param names from a registry.""" + mapping_names = [] + for mapping in registry.mappings: + if hasattr(mapping, "megatron_param"): + mapping_names.append(str(getattr(mapping, "megatron_param"))) + hf = getattr(mapping, "hf_param", None) + if isinstance(hf, dict): + mapping_names.extend([str(v) for v in hf.values()]) + elif isinstance(hf, str): + mapping_names.append(hf) + return mapping_names + + def test_mapping_registry_returns_correct_type(self, qwen2_audio_bridge): + """Test mapping_registry returns MegatronMappingRegistry.""" + registry = qwen2_audio_bridge.mapping_registry() + + assert isinstance(registry, MegatronMappingRegistry) + + def test_mapping_registry_contains_embeddings(self, qwen2_audio_bridge): + """Test mapping_registry contains word embeddings mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + 
has_embeddings = any("embed_tokens" in name or "word_embeddings" in name for name in mapping_names) + assert has_embeddings, "Should contain embeddings mapping" + + def test_mapping_registry_contains_audio_tower(self, qwen2_audio_bridge): + """Test mapping_registry contains audio_tower mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_audio_tower = any("audio_tower" in name for name in mapping_names) + assert has_audio_tower, "Should contain audio_tower mapping" + + def test_mapping_registry_contains_projector(self, qwen2_audio_bridge): + """Test mapping_registry contains multi_modal_projector mapping.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_projector = any("multi_modal_projector" in name for name in mapping_names) + assert has_projector, "Should contain multi_modal_projector mapping" + + def test_mapping_registry_contains_qkv(self, qwen2_audio_bridge): + """Test mapping_registry contains QKV parameter mappings.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_qkv = any("linear_qkv" in name for name in mapping_names) + assert has_qkv, "Should contain QKV mappings" + + def test_mapping_registry_contains_mlp(self, qwen2_audio_bridge): + """Test mapping_registry contains MLP parameter mappings.""" + registry = qwen2_audio_bridge.mapping_registry() + mapping_names = self._get_mapping_names(registry) + + has_mlp = any("mlp" in name for name in mapping_names) + assert has_mlp, "Should contain MLP mappings" + + +class TestQwen2AudioBridgeEdgeCases: + """Test edge cases and error conditions.""" + + def test_provider_bridge_with_minimal_config(self, qwen2_audio_bridge): + """Test provider_bridge with minimal HF config.""" + minimal_pretrained = Mock(spec=PreTrainedVLM) + minimal_config = Mock() + + text_config = SimpleNamespace( + num_hidden_layers=24, + 
hidden_size=2048, + intermediate_size=5504, + num_attention_heads=16, + num_key_value_heads=16, + initializer_range=0.02, + rms_norm_eps=1e-6, + vocab_size=151936, + max_position_embeddings=4096, + rope_theta=1000000.0, + hidden_act="silu", + tie_word_embeddings=False, + rope_scaling=None, + torch_dtype=torch.bfloat16, + ) + + minimal_config.text_config = text_config + minimal_config.tie_word_embeddings = False + minimal_config.audio_token_index = 151646 + minimal_config.bos_token_id = 151643 + minimal_config.eos_token_id = 151645 + minimal_config.pad_token_id = 151643 + minimal_pretrained.config = minimal_config + + provider = qwen2_audio_bridge.provider_bridge(minimal_pretrained) + + assert isinstance(provider, Qwen2AudioModelProvider) + assert provider.num_layers == 24 + assert provider.hidden_size == 2048 + + def test_provider_bridge_with_different_vocab_sizes(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different vocabulary sizes.""" + test_vocab_sizes = [32000, 151936, 152064] + + for vocab_size in test_vocab_sizes: + mock_hf_pretrained.config.text_config.vocab_size = vocab_size + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.vocab_size == vocab_size + + def test_provider_bridge_with_different_sequence_lengths(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different sequence lengths.""" + test_seq_lengths = [2048, 4096, 8192, 32768] + + for seq_length in test_seq_lengths: + mock_hf_pretrained.config.text_config.max_position_embeddings = seq_length + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.seq_length == seq_length + + +class TestQwen2AudioBridgeCompatibility: + """Test compatibility with different HF model configurations.""" + + def test_provider_bridge_with_group_query_attention(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with group query attention.""" + 
mock_hf_pretrained.config.text_config.num_attention_heads = 32 + mock_hf_pretrained.config.text_config.num_key_value_heads = 8 + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.num_attention_heads == 32 + assert provider.num_query_groups == 8 + + def test_provider_bridge_with_different_rope_theta(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with different RoPE theta values.""" + test_rope_values = [10000.0, 500000.0, 1000000.0] + + for rope_theta in test_rope_values: + mock_hf_pretrained.config.text_config.rope_theta = rope_theta + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + assert provider.rotary_base == rope_theta + + def test_provider_bridge_with_missing_audio_token_index(self, qwen2_audio_bridge, mock_hf_pretrained): + """Test provider_bridge with missing audio_token_index uses default.""" + delattr(mock_hf_pretrained.config, "audio_token_index") + + provider = qwen2_audio_bridge.provider_bridge(mock_hf_pretrained) + + assert provider.audio_token_id == 151646 diff --git a/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py new file mode 100644 index 0000000000..4b53cb3212 --- /dev/null +++ b/tests/unit_tests/models/qwen_audio/test_qwen2_audio_provider.py @@ -0,0 +1,209 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from megatron.bridge.models.qwen_audio import Qwen2AudioModelProvider


def _build_provider(**overrides):
    """Return a Qwen2AudioModelProvider from a small base config plus *overrides*.

    The base config (32 layers, hidden size 4096, 32 attention heads) matches
    the configuration repeated throughout these tests; any keyword argument
    passed here replaces or extends it.
    """
    kwargs = {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}
    kwargs.update(overrides)
    return Qwen2AudioModelProvider(**kwargs)


class TestQwen2AudioModelProvider:
    """Unit tests for construction and configuration of Qwen2AudioModelProvider."""

    def test_initialization(self):
        """Core dimensions passed at construction are reported back unchanged."""
        provider = _build_provider()

        assert (provider.num_layers, provider.hidden_size, provider.num_attention_heads) == (32, 4096, 32)

    def test_audio_specific_defaults(self):
        """Defaults specific to the audio-language variant are applied."""
        provider = _build_provider()

        # Audio-language models shouldn't scatter embeddings.
        assert provider.scatter_embedding_sequence_parallel is False

        # No HF config is attached unless one is supplied explicitly.
        assert provider.hf_config is None

        # Default special-token ids.
        assert provider.audio_token_id == 151646
        assert provider.bos_token_id == 151643
        assert provider.eos_token_id == 151645
        assert provider.pad_token_id == 151643

        # Nothing is frozen out of the box.
        assert provider.freeze_language_model is False
        assert provider.freeze_audio_model is False
        assert provider.freeze_audio_projection is False

    def test_custom_token_ids(self):
        """Caller-supplied token ids override the defaults."""
        provider = _build_provider(
            audio_token_id=200,
            bos_token_id=201,
            eos_token_id=202,
            pad_token_id=203,
        )

        expected = {"audio_token_id": 200, "bos_token_id": 201, "eos_token_id": 202, "pad_token_id": 203}
        for field_name, value in expected.items():
            assert getattr(provider, field_name) == value

    def test_freeze_options(self):
        """All three freeze switches can be enabled together."""
        provider = _build_provider(
            freeze_language_model=True,
            freeze_audio_model=True,
            freeze_audio_projection=True,
        )

        assert provider.freeze_language_model is True
        assert provider.freeze_audio_model is True
        assert provider.freeze_audio_projection is True

    def test_custom_hf_config(self):
        """An explicit hf_config is stored by identity, not copied."""
        dummy_config = {"text_config": {}, "audio_config": {}}

        provider = _build_provider(hf_config=dummy_config)

        assert provider.hf_config is dummy_config

    def test_provide_method_exists(self):
        """The provider exposes a callable provide() entry point."""
        provider = _build_provider()

        # callable(None) is False, so this also covers the attribute's existence.
        assert callable(getattr(provider, "provide", None))

    def test_provide_language_model_method_exists(self):
        """The provider exposes a callable provide_language_model() entry point."""
        provider = _build_provider()

        assert callable(getattr(provider, "provide_language_model", None))

    def test_inherit_from_qwen2_provider(self):
        """Qwen2 configuration fields and defaults are inherited correctly."""
        provider = _build_provider(seq_length=8192, vocab_size=152064, rotary_base=500000.0)

        # Explicit overrides flow through the inherited fields.
        assert provider.seq_length == 8192
        assert provider.vocab_size == 152064
        assert provider.rotary_base == 500000.0

        # Qwen2 architectural defaults come through unchanged.
        assert provider.normalization == "RMSNorm"
        assert provider.gated_linear_unit is True
        assert provider.add_qkv_bias is True
        assert provider.add_bias_linear is False

        # The audio-specific override still applies on top of the Qwen2 base.
        assert provider.scatter_embedding_sequence_parallel is False

    def test_edge_cases(self):
        """Boundary configurations: the smallest and a very large model shape."""
        tiny = _build_provider(num_layers=1, hidden_size=64, num_attention_heads=1)

        assert tiny.num_layers == 1
        assert tiny.hidden_size == 64
        assert tiny.num_attention_heads == 1
        assert tiny.scatter_embedding_sequence_parallel is False

        large = _build_provider(
            num_layers=80,
            hidden_size=8192,
            num_attention_heads=64,
            num_query_groups=8,
        )

        assert large.num_layers == 80
        assert large.hidden_size == 8192
        assert large.num_attention_heads == 64
        assert large.num_query_groups == 8


class TestQwen2AudioModelProviderInheritance:
    """Tests for the class hierarchy of Qwen2AudioModelProvider."""

    def test_inherits_from_gpt_provider(self):
        """Qwen2AudioModelProvider must sit under GPTModelProvider."""
        from megatron.bridge.models.gpt_provider import GPTModelProvider

        assert issubclass(Qwen2AudioModelProvider, GPTModelProvider)

    def test_inherits_from_qwen2_provider(self):
        """Qwen2AudioModelProvider must sit under Qwen2ModelProvider."""
        from megatron.bridge.models.qwen.qwen_provider import Qwen2ModelProvider

        assert issubclass(Qwen2AudioModelProvider, Qwen2ModelProvider)

    def test_provider_method_inheritance(self):
        """Inherited methods and audio-specific fields are both present."""
        provider = _build_provider()

        # Inherited Qwen2ModelProvider API.
        for method_name in ("provide", "provide_language_model"):
            assert hasattr(provider, method_name)

        # Audio-specific fields exist alongside the inherited API.
        for field_name in ("freeze_language_model", "freeze_audio_model", "freeze_audio_projection"):
            assert hasattr(provider, field_name)