1 change: 1 addition & 0 deletions .github/workflows/cicd-main.yml
@@ -363,6 +363,7 @@ jobs:
- script: L2_Launch_models_qwen
- script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_vl
- script: L2_Launch_models_qwen35_vl
- script: L2_Launch_recipes_gemma_vl
- script: L2_Launch_recipes_gpt_oss
- script: L2_Launch_models_qwen_vl_quantization
29 changes: 23 additions & 6 deletions examples/conversion/compare_hf_and_megatron/compare.py
@@ -91,6 +91,7 @@
"""

import argparse
import gc
import importlib
import os
import sys
@@ -318,7 +319,13 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor:
def loss_func(x, **kwargs):
return x

return model(**forward_args), loss_func
model_output = model(**forward_args)
if isinstance(model_output, tuple):
output_tensor, _ = model_output
else:
output_tensor = model_output

return output_tensor, loss_func


def load_image(image_path: str) -> Image.Image:
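The tuple handling added above can be exercised in isolation. A minimal sketch, where `unpack_forward_output` is a hypothetical helper (not a Megatron-Bridge API): some model variants return `(output, bias)`-style tuples from `forward()`, while others return a bare tensor.

```python
def unpack_forward_output(model_output):
    """Return the output whether forward() gave a tuple or a single value."""
    if isinstance(model_output, tuple):
        # e.g. (logits, bias)-style returns: keep only the first element
        output_tensor, _ = model_output
    else:
        output_tensor = model_output
    return output_tensor


# Both call shapes yield the same result:
print(unpack_forward_output(("logits", None)))  # → logits
print(unpack_forward_output("logits"))          # → logits
```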
@@ -609,6 +616,11 @@ def _load_megatron_model(args):
model_provider.finalize()
megatron_model = model_provider.provide_distributed_model(wrap_with_ddp=False)

# Workaround: disable MTP for inference (causes hangs on NCCL collectives)
for m in megatron_model:
m.config.mtp_num_layers = None
m.config.grad_scale_func = None
Comment on lines +619 to +622
@coderabbitai (bot, Contributor), Feb 27, 2026
Repository: NVIDIA-NeMo/Megatron-Bridge

⚠️ Potential issue | 🟠 Major

Guard the MTP workaround with attribute checks.

The code unconditionally assigns to `m.config.mtp_num_layers` and `m.config.grad_scale_func` without verifying that they exist. These fields are optional across config types (note the `Optional[int]` definitions in provider configs and the defensive `getattr()` patterns elsewhere in the codebase), so configs that lack them may fail during this workaround.

Proposed fix
    for m in megatron_model:
-        m.config.mtp_num_layers = None
-        m.config.grad_scale_func = None
+        if hasattr(m.config, "mtp_num_layers"):
+            m.config.mtp_num_layers = None
+        if hasattr(m.config, "grad_scale_func"):
+            m.config.grad_scale_func = None

@cuichenx (Contributor, Author) replied:

Setting them to None has no issues.

@coderabbitai (bot) replied:

@cuichenx, understood! I've noted that setting these config attributes to None works without issues in this context. Thanks for clarifying.
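The exchange above turns on plain Python attribute semantics: assigning to an attribute on a regular (non-`__slots__`) object succeeds whether or not the attribute was previously defined. A minimal illustration with a hypothetical stand-in class (not the actual Megatron config):

```python
class FakeConfig:
    """Hypothetical stand-in for a Megatron model config (no __slots__)."""
    pass


cfg = FakeConfig()
# Neither attribute was ever defined, yet plain assignment does not raise:
cfg.mtp_num_layers = None
cfg.grad_scale_func = None

assert cfg.mtp_num_layers is None
assert hasattr(cfg, "grad_scale_func")
```

A config class declared with `__slots__` (or a dataclass with `slots=True`) would instead raise `AttributeError` here, which is the failure mode the proposed `hasattr` guard protects against.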



A Contributor replied:

Hmm, this breaks MTP in the Mamba model. It looks like GPT and Mamba use different logic to decide whether MTP is needed: for Mamba, the MTP layers are still called even when `mtp_num_layers=None` (here), and the MTP forward pass then crashes (here).



model_components = [m.eval() for m in megatron_model]

# Register debug hooks if enabled
@@ -715,11 +727,10 @@ def compare_models_one_step(args) -> None:
)

del hf_model
# Reload Megatron model to ensure a fresh instance before comparison
megatron_model, _ = _load_megatron_model(args)
gc.collect()
torch.cuda.empty_cache()

# Broadcast HF results to all ranks after Megatron initialization
# (following the pattern from generate_from_hf.py)
# Broadcast HF results to all ranks
if torch.distributed.is_initialized():
# Create tensors for broadcasting if they don't exist on non-rank-0
if hf_next_token is None:
@@ -731,6 +742,9 @@
)
hf_logits = torch.zeros(vocab_size, device=input_ids.device, dtype=torch.float32)

# Ensure consistent dtype across ranks before broadcast
hf_logits = hf_logits.float()

# Broadcast from rank 0 to all ranks
torch.distributed.broadcast(hf_next_token, 0)
torch.distributed.broadcast(hf_logits, 0)
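The dtype normalization above matters because the receive buffers on non-source ranks must match the source tensor's shape and dtype for the broadcast to be well defined: rank 0 may hold logits in the model's compute dtype (often bfloat16) while the other ranks allocated float32 zeros. A minimal single-process sketch of just the normalization step (the collective itself is omitted, since it requires an initialized process group):

```python
import torch

# Rank 0 computes logits in the model's dtype (often bfloat16)...
hf_logits = torch.randn(8, dtype=torch.bfloat16)
# ...while non-source ranks allocate float32 receive buffers, so the
# source tensor is cast before the broadcast to keep dtypes consistent.
hf_logits = hf_logits.float()

assert hf_logits.dtype == torch.float32
```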
@@ -778,7 +792,10 @@ def compare_models_one_step(args) -> None:
megatron_logits = megatron_output[0, -1, :]
megatron_next_token = torch.argmax(megatron_logits, dim=-1)

if not torch.distributed.is_initialized() or parallel_state.get_tensor_model_parallel_rank() == 0:
if not torch.distributed.is_initialized() or (
parallel_state.get_tensor_model_parallel_rank() == 0
and parallel_state.get_expert_model_parallel_rank() == 0
):
print(f"Megatron output shape: {megatron_output.shape}")
print(f"Megatron logits stats - mean: {megatron_logits.mean():.4f}, std: {megatron_logits.std():.4f}")
print(
2 changes: 2 additions & 0 deletions examples/conversion/hf_megatron_roundtrip_multi_gpu.py
@@ -62,6 +62,8 @@
# These are compared in float32 to avoid false mismatches.
IGNORE_PRECISION_PARAMS = [
"e_score_correction_bias",
"A_log",
"linear_attn.norm.weight",
]


52 changes: 52 additions & 0 deletions examples/models/vlm/qwen35_vl/conversion.sh
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workspace directory for checkpoints and results
WORKSPACE=${WORKSPACE:-/workspace}
MODEL_NAME=Qwen3.5-35B-A3B # Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-397B-A17B, Qwen3.5-27B

if [ "${MODEL_NAME}" = "Qwen3.5-27B" ]; then
HF_MODEL_CLASS="Qwen3_5ForConditionalGeneration"
else
HF_MODEL_CLASS="Qwen3_5MoeForConditionalGeneration"
fi

# Make sure to upgrade to transformers >= 5.2.0
# uv add "transformers>=5.2.0"

# Import HF → Megatron
uv run python examples/conversion/convert_checkpoints.py import \
--hf-model Qwen/${MODEL_NAME} \
--megatron-path ${WORKSPACE}/${MODEL_NAME} \
--torch-dtype bfloat16

# HF and Megatron models logits comparison validation
uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/compare_hf_and_megatron/compare.py \
--hf_model_path Qwen/${MODEL_NAME} \
--megatron_model_path ${WORKSPACE}/${MODEL_NAME} \
--model_class "${HF_MODEL_CLASS}" \
--image_path "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" \
--prompt "Describe this image." \
--tp 1 --pp 1 --ep 8

# Export Megatron → HF
uv run python examples/conversion/convert_checkpoints.py export \
--hf-model Qwen/${MODEL_NAME} \
--megatron-path ${WORKSPACE}/${MODEL_NAME}/iter_0000000 \
--hf-path ${WORKSPACE}/${MODEL_NAME}-hf-export

# Round-trip validation
uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
--hf-model-id Qwen/${MODEL_NAME} --tp 1 --pp 2 --ep 4 --trust-remote-code
43 changes: 43 additions & 0 deletions examples/models/vlm/qwen35_vl/inference.sh
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workspace directory for checkpoints and results
WORKSPACE=${WORKSPACE:-/workspace}
MODEL_NAME=Qwen3.5-35B-A3B # Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-27B

# Inference with Hugging Face checkpoints
uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path Qwen/${MODEL_NAME} \
--image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
--prompt "Describe this image." \
--max_new_tokens 50 \
--tp 2 --pp 2 --ep 4

# Inference with imported Megatron checkpoints
uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path Qwen/${MODEL_NAME} \
--megatron_model_path ${WORKSPACE}/${MODEL_NAME}/iter_0000000 \
--image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
--prompt "Describe this image." \
--max_new_tokens 50 \
--tp 2 --pp 2 --ep 4

# Inference with exported HF checkpoints
uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path ${WORKSPACE}/${MODEL_NAME}-hf-export \
--image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
--prompt "Describe this image." \
--max_new_tokens 50 \
--tp 2 --pp 2 --ep 4
180 changes: 180 additions & 0 deletions examples/models/vlm/qwen35_vl/slurm_inference.sh
@@ -0,0 +1,180 @@
#!/bin/bash
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ==============================================================================
# Qwen3.5-VL Multi-Node Distributed Inference for Qwen3.5-397B-A17B
# Recommended: TP=2, PP=4, EP=8 for full model (32 GPUs, 4 nodes)
#
# Usage:
# 1. Modify the #SBATCH directives below for your cluster
# 2. Set MODEL_PATH and CHECKPOINT_PATH as needed
# 3. Set CONTAINER_IMAGE or use --no-container-image for bare metal
# 4. Submit: sbatch slurm_inference.sh
# ==============================================================================

#SBATCH --job-name=qwen35v-inference
#SBATCH --nodes=4 # Number of nodes (32 GPUs = 4 nodes × 8 GPUs)
#SBATCH --ntasks-per-node=8 # Tasks per node (1 per GPU)
#SBATCH --gpus-per-node=8 # GPUs per node
#SBATCH --time=02:00:00 # Max run time (2 hours)
#SBATCH --partition=gpu # Partition name
#SBATCH --account=my_account # Account name
#SBATCH --output=logs/qwen35v_inference_%j.out
#SBATCH --error=logs/qwen35v_inference_%j.err
#SBATCH --exclusive # Exclusive node access

# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Workspace directory
WORKSPACE=${WORKSPACE:-/workspace}

# Model configuration
MODEL_NAME=Qwen3.5-397B-A17B

# Option 1: Use HuggingFace model path (will load and convert on-the-fly)
MODEL_PATH=${WORKSPACE}/${MODEL_NAME}
# MODEL_PATH=Qwen/${MODEL_NAME} # Or use HF Hub path

# Option 2: Use pre-converted Megatron checkpoint (faster)
MEGATRON_CHECKPOINT=${WORKSPACE}/${MODEL_NAME}/iter_0000000
# Comment out to use HF model directly

# Inference configuration
IMAGE_PATH="https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png"
PROMPT="Describe this image."
MAX_NEW_TOKENS=1000

# Parallelism configuration for 32 GPUs (4 nodes × 8 GPUs)
TP=2 # Tensor Parallelism
PP=4 # Pipeline Parallelism
EP=8 # Expert Parallelism (MoE)

# Container configuration (required for SLURM pyxis)
CONTAINER_IMAGE=""
# CONTAINER_IMAGE="/path/to/nemo-framework.sqsh"

# Container mounts (optional, space-separated)
CONTAINER_MOUNTS=""
# CONTAINER_MOUNTS="/data:/data /workspace:/workspace"

# Set to true to run without container (bare metal)
NO_CONTAINER=false

# ==============================================================================
# Environment Setup
# ==============================================================================

# NCCL optimizations
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export NCCL_NVLS_ENABLE=0

# UV cache on shared filesystem (recommended for multi-node setups)
# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
# export UV_CACHE_DIR="/path/to/shared/uv_cache"

# HuggingFace cache directory (recommended for shared filesystem)
# export HF_HOME="/path/to/shared/HF_HOME"

# Authentication tokens
# export HF_TOKEN="hf_your_token_here"

# Make sure to upgrade container image to transformers >= 5.2.0 (required for Qwen3.5)
# Run once: uv add "transformers>=5.2.0"

# ==============================================================================
# Job Execution
# ==============================================================================

echo "======================================"
echo "Qwen3.5-VL Multi-Node Inference"
echo "======================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Nodes: $SLURM_JOB_NUM_NODES"
echo "GPUs per node: $SLURM_GPUS_PER_NODE"
echo "Total GPUs: $((SLURM_JOB_NUM_NODES * SLURM_GPUS_PER_NODE))"
echo "Model: $MODEL_NAME"
echo "Parallelism: TP=$TP, PP=$PP, EP=$EP"
echo "======================================"

# Create logs directory
mkdir -p logs

# Calculate total processes
TOTAL_GPUS=$((SLURM_JOB_NUM_NODES * SLURM_GPUS_PER_NODE))
REQUIRED_GPUS=$(( (TP > EP ? TP : EP) * PP ))

# Validate parallelism configuration
if [ $REQUIRED_GPUS -ne $TOTAL_GPUS ]; then
echo "ERROR: Parallelism mismatch!"
echo " max(TP, EP) × PP = max($TP, $EP) × $PP = $REQUIRED_GPUS"
echo " Total allocated GPUs = $TOTAL_GPUS"
echo " These must be equal!"
exit 1
fi
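The check above encodes the assumption that the model-parallel world size is max(TP, EP) × PP: tensor-parallel and expert-parallel ranks share the same GPUs within a pipeline stage, so the larger of the two determines the per-stage GPU count. The same arithmetic can be sketched standalone with this script's default values:

```shell
# Standalone sketch of the GPU-count validation above, using this
# script's defaults for the 397B model (TP=2, PP=4, EP=8 on 32 GPUs).
TP=2; PP=4; EP=8
TOTAL_GPUS=32

# Bash arithmetic supports the C-style ternary used to take max(TP, EP).
REQUIRED_GPUS=$(( (TP > EP ? TP : EP) * PP ))

if [ "$REQUIRED_GPUS" -ne "$TOTAL_GPUS" ]; then
    echo "ERROR: need $REQUIRED_GPUS GPUs, allocated $TOTAL_GPUS" >&2
    exit 1
fi
echo "OK: $REQUIRED_GPUS GPUs"   # prints "OK: 32 GPUs"
```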

MEGATRON_CKPT_ARG=""
if [ -n "$MEGATRON_CHECKPOINT" ]; then
MEGATRON_CKPT_ARG="--megatron_model_path $MEGATRON_CHECKPOINT"
fi

CMD="uv run --no-sync python examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path $MODEL_PATH \
$MEGATRON_CKPT_ARG \
--image_path \"$IMAGE_PATH\" \
--prompt \"$PROMPT\" \
--max_new_tokens $MAX_NEW_TOKENS \
--tp $TP \
--pp $PP \
--ep $EP"

# Only rank 0 on each node runs uv sync
SYNC_CMD="if [ \"\$SLURM_LOCALID\" -eq 0 ]; then uv sync; else sleep 5; fi"
FULL_CMD="$SYNC_CMD && $CMD"

echo "Executing inference..."
echo "Command: $CMD"
echo "======================================"

# Execute based on container configuration
if [ "$NO_CONTAINER" = true ]; then
echo "Running without container (bare metal)"
srun --mpi=pmix bash -c "$FULL_CMD"
else
# Require container image
if [ -z "$CONTAINER_IMAGE" ]; then
echo "ERROR: CONTAINER_IMAGE must be set, or use NO_CONTAINER=true for bare metal."
exit 1
fi

echo "Running with container: $CONTAINER_IMAGE"

# Build srun command with container
SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"

# Add container mounts
if [ -n "$CONTAINER_MOUNTS" ]; then
for mount in $CONTAINER_MOUNTS; do
SRUN_CMD="$SRUN_CMD --container-mounts=$mount"
done
fi

$SRUN_CMD bash -c "$FULL_CMD"
fi

echo "======================================"
echo "Inference completed"
echo "======================================"
8 changes: 8 additions & 0 deletions src/megatron/bridge/models/__init__.py
@@ -182,6 +182,10 @@
Qwen25VLBridge,
Qwen25VLModel,
Qwen25VLModelProvider,
Qwen35VLBridge,
Qwen35VLModelProvider,
Qwen35VLMoEBridge,
Qwen35VLMoEModelProvider,
)
from megatron.bridge.models.qwen_vl.modelling_qwen3_vl import (
Qwen3VLBridge,
@@ -331,6 +335,10 @@
"Qwen3VLMoEModelProvider",
"Qwen3VLBridge",
"Qwen3VLMoEBridge",
"Qwen35VLBridge",
"Qwen35VLModelProvider",
"Qwen35VLMoEBridge",
"Qwen35VLMoEModelProvider",
"Gemma3VLBridge",
"Gemma3VLModel",
"Gemma3VLModelProvider",