diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md
index 470b0d78f8..fd4c02d3a7 100644
--- a/examples/models/vlm/qwen3_vl/README.md
+++ b/examples/models/vlm/qwen3_vl/README.md
@@ -117,6 +117,20 @@ W&B report coming soon.
 
 **Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs.
 
+## Finetuning with Energon Dataset
+
+Follow the instructions [here](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal#pretraining) to prepare the `LLaVA-Pretrain` dataset in Energon format. Then change the `.nv-meta/dataset.yaml` file in the prepared dataset to the following:
+
+```yaml
+__module__: megatron.bridge.recipes.qwen_vl.data.energon.task_encoder
+__class__: ChatMLWebdataset
+field_map:
+  imgs: jpg
+  conversation: json
+```
+
+Then, update the dataset path (`dataset.path=/path/to/energon/dataset`) in [energon_test.sh](energon_test.sh) and run the script.
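+
+For reference, the command that the script runs for each configuration looks roughly like the following (a sketch based on [energon_test.sh](energon_test.sh); the process count, checkpoint path, and override values are illustrative):
+
+```bash
+uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+    --recipe qwen3_vl_8b_finetune_config \
+    --step_func qwen3_vl_step \
+    --peft_scheme lora \
+    --dataset_type energon \
+    checkpoint.pretrained_checkpoint=/workspace/models/Qwen3-VL-8B-Instruct \
+    dataset.path=/path/to/energon/dataset \
+    dataset.seq_length=4096 \
+    model.seq_length=4096
+```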
+
 ## Evaluation
 
 Coming soon.
diff --git a/examples/models/vlm/qwen3_vl/energon_test.sh b/examples/models/vlm/qwen3_vl/energon_test.sh
new file mode 100755
index 0000000000..10188ac86d
--- /dev/null
+++ b/examples/models/vlm/qwen3_vl/energon_test.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Before training, make sure to set WANDB_API_KEY or disable wandb logging
+# export WANDB_API_KEY=
+# export WANDB_MODE=disabled
+
+# Test sequence packing configurations for LoRA finetuning on the dense model
+PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen3-VL-8B-Instruct
+MODEL_NAME=qwen3_vl_8b
+DATASET_NAME=energon
+SEQ_LENGTH=4096
+TRAIN_ITERS=50
+GLOBAL_BATCH_SIZE=32
+MICRO_BATCH_SIZE=2
+EVAL_ITERS=10
+LR=0.00005
+MIN_LR=0.000005
+LR_WARMUP_ITERS=10
+LOG_INTERVAL=1
+WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
+
+SEQ_PACKING_CONFIGS=(False True)
+
+# Parallelism configurations, each given as "EP,TP,PP,CP,N_PROC"
+# N_PROC is the total number of processes (GPUs) used for training
+# N_PROC is used to control the DP size so that the loss curves are comparable
+PARALLELISM_CONFIGS=("1,1,1,4,8" "1,1,1,2,4" "1,1,1,1,2")
+
+for pack_config in "${SEQ_PACKING_CONFIGS[@]}"; do
+    for par_config in "${PARALLELISM_CONFIGS[@]}"; do
+        IFS=',' read -r EP TP PP CP N_PROC <<< "$par_config"
+        echo "Running LoRA finetuning pack_sequences_in_batch=$pack_config with EP=$EP TP=$TP PP=$PP CP=$CP N_PROC=$N_PROC"
+        uv run python -m torch.distributed.run --nproc_per_node=$N_PROC scripts/training/run_recipe.py \
+            --recipe ${MODEL_NAME}_finetune_config \
+            --step_func qwen3_vl_step \
+            --peft_scheme lora \
+            --dataset_type energon \
+            checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
+            model.seq_length=$SEQ_LENGTH \
+            train.train_iters=$TRAIN_ITERS \
+            train.global_batch_size=$GLOBAL_BATCH_SIZE \
+            train.micro_batch_size=$MICRO_BATCH_SIZE \
+            train.eval_iters=$EVAL_ITERS \
+            optimizer.lr=$LR \
+            optimizer.min_lr=$MIN_LR \
+            scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
+            checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_seq_pack_${pack_config}_cp${CP} \
+            logger.log_interval=$LOG_INTERVAL \
+            logger.wandb_project=$WANDB_PROJECT \
+            logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_seq_pack_${pack_config}_cp${CP} \
+            dataset.seq_length=$SEQ_LENGTH \
+            dataset.path=/path/to/energon/dataset \
+            dataset.pack_sequences_in_batch=$pack_config \
+            model.expert_model_parallel_size=$EP \
+            model.tensor_model_parallel_size=$TP \
+            model.pipeline_model_parallel_size=$PP \
+            model.context_parallel_size=$CP \
+            model.calculate_per_token_loss=True \
+            ddp.average_in_collective=False \
+            ddp.grad_reduce_in_fp32=True
+    done
+done
+
diff --git a/examples/models/vlm/qwen3_vl/peft_seq_unpacked.sh b/examples/models/vlm/qwen3_vl/peft_seq_unpacked.sh
old mode 100644
new mode 100755
diff --git a/scripts/training/run_recipe.py b/scripts/training/run_recipe.py
index 5b2524e0be..bb4a676678 100755
--- a/scripts/training/run_recipe.py
+++ b/scripts/training/run_recipe.py
@@ -132,6 +132,12 @@ def parse_args() -> tuple[argparse.Namespace, list[str]]:
         default=None,
         help="Sequence length for training",
     )
+    parser.add_argument(
+        "--dataset_type",
+        type=str,
+        default=None,
+        help="Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded').",
+    )
 
     args, cli_overrides = parser.parse_known_args()
     return args, cli_overrides
@@ -141,6 +147,7 @@ def load_recipe(
     peft_scheme: str | None,
     packed_sequence: bool = False,
     seq_length: int | None = None,
+    dataset_type: str | None = None,
 ) -> ConfigContainer:
     """
     Load recipe by name from megatron.bridge.recipes.
@@ -150,6 +157,7 @@ def load_recipe(
         peft_scheme: PEFT scheme to use ('lora', 'dora', or None)
         packed_sequence: Enable packed sequence training (default: False)
         seq_length: Sequence length for training (optional)
+        dataset_type: Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded')
 
     Returns:
         ConfigContainer from calling the recipe
@@ -175,11 +183,13 @@ def load_recipe(
         accepts_peft = "peft" in params or has_var_keyword
         accepts_packed_sequence = "packed_sequence" in params or has_var_keyword
         accepts_seq_length = "seq_length" in params or has_var_keyword
+        accepts_dataset_type = "dataset_type" in params or has_var_keyword
     except (ValueError, TypeError):
         # If signature inspection fails, fallback conservatively
         accepts_peft = True  # peft is widely supported, try passing it
         accepts_packed_sequence = False  # new parameter, don't pass if unsure
         accepts_seq_length = False  # new parameter, don't pass if unsure
+        accepts_dataset_type = False  # VLM-specific, don't pass if unsure
 
     # Build kwargs dynamically based on what the recipe accepts
     kwargs = {}
@@ -189,6 +199,8 @@
         kwargs["packed_sequence"] = packed_sequence
     if accepts_seq_length and seq_length is not None:
         kwargs["seq_length"] = seq_length
+    if accepts_dataset_type and dataset_type is not None:
+        kwargs["dataset_type"] = dataset_type
 
     try:
         return config_builder(**kwargs)
@@ -224,6 +236,7 @@ def main() -> None:
         args.peft_scheme,
         args.packed_sequence,
         args.seq_length,
+        args.dataset_type,
     )
 
     config = process_config_with_overrides(
diff --git a/src/megatron/bridge/data/energon/base_energon_datamodule.py b/src/megatron/bridge/data/energon/base_energon_datamodule.py
index c46970c9d5..37d691ad44 100644
--- a/src/megatron/bridge/data/energon/base_energon_datamodule.py
+++ b/src/megatron/bridge/data/energon/base_energon_datamodule.py
@@ -15,7 +15,7 @@
 import logging
 from typing import Any, Literal, Optional
 
-from megatron.core import parallel_state
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset
 
 
@@ -64,6 +64,7 @@ def __init__(
         decoder_seq_length: Optional[int] = None,
         packing_buffer_size: Optional[int] = None,
         validation_task_encoder: Optional[Any] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
         **kwargs,
     ) -> None:
         """
@@ -89,6 +90,8 @@
             packing_buffer_size (int, optional): Size of the packing buffer for batched samples. Defaults to None.
             validation_task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching
                 samples for validation. Defaults to None and will be the same as task_encoder.
+            pg_collection (ProcessGroupCollection, optional): Process group collection for distributed training.
+                If provided, used instead of the global parallel_state. Defaults to None.
             **kwargs: Additional keyword arguments. Will be passed to get_train_dataset() of Energon
 
         """
@@ -112,8 +115,49 @@
         self.packing_buffer_size = packing_buffer_size
         self.validation_task_encoder = validation_task_encoder or self.task_encoder
         self.num_val_workers = num_val_workers or self.num_workers
+        self.pg_collection = pg_collection
         self.kwargs = kwargs
 
+    def _build_worker_config(self, num_workers: int, split: str = "train") -> WorkerConfig:
+        """Build a WorkerConfig using pg_collection, falling back to default_worker_config.
+
+        NOTE: We intentionally use the pure DP rank (pg_collection.dp)
+        rather than the combined DP-CP rank. With Megatron's rank ordering
+        (default "tp-cp-ep-dp-pp"), all CP ranks within the same DP replica
+        already share the same pure DP rank. This ensures that CP ranks
+        processing different sequence portions of the same batch receive
+        identical data from the dataloader.
+        Using dp_cp would be INCORRECT here — it would assign each CP rank
+        a unique rank, causing them to read different data shards.
+        """
+        if self.pg_collection is None or self.pg_collection.dp is None:
+            logger.info(
+                f"Multimodal {split} data loader pg_collection is not available, "
+                f"using default worker config with num_workers {num_workers}"
+            )
+            return WorkerConfig.default_worker_config(num_workers)
+
+        rank = self.pg_collection.dp.rank()
+        world_size = self.pg_collection.dp.size()
+        data_parallel_group = self.pg_collection.dp
+        cp_rank = self.pg_collection.cp.rank() if self.pg_collection.cp is not None else 0
+        cp_size = self.pg_collection.cp.size() if self.pg_collection.cp is not None else 1
+
+        logger.info(
+            f"Multimodal {split} dataloader initializing with "
+            f"dp_rank {rank} dp_world_size {world_size} "
+            f"cp_rank {cp_rank} cp_size {cp_size} "
+            f"data_parallel_group {data_parallel_group}"
+        )
+        return WorkerConfig(
+            rank=rank,
+            world_size=world_size,
+            num_workers=num_workers,
+            data_parallel_group=data_parallel_group,
+            worker_debug_path=None,
+            worker_log_level=0,
+        )
+
     def datasets_provider(self, worker_config, split: Literal["train", "val"] = "val"):
         """
         Provide the dataset for training or validation.
@@ -165,28 +209,7 @@ def train_dataloader(self) -> Any:
         logger.info(f"Multimodal train dataloader initializing with init_global_step {self.init_global_step}")
         if self.train_dataloader_object:
             return self.train_dataloader_object
-        if not parallel_state.is_initialized():
-            logger.info(
-                f"Muiltimodal data loader parallel state is not initialized,"
-                f"using default worker config with no_workers {self.num_workers}"
-            )
-            worker_config = WorkerConfig.default_worker_config(self.num_workers)
-        else:
-            rank = parallel_state.get_data_parallel_rank()
-            world_size = parallel_state.get_data_parallel_world_size()
-            data_parallel_group = parallel_state.get_data_parallel_group()
-            logger.info(
-                f" Multimodal train dataloader initializing with"
-                f"rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** "
-            )
-            worker_config = WorkerConfig(
-                rank=rank,
-                world_size=world_size,
-                num_workers=self.num_workers,
-                data_parallel_group=data_parallel_group,
-                worker_debug_path=None,
-                worker_log_level=0,
-            )
+        worker_config = self._build_worker_config(self.num_workers, split="train")
         train_dataset = self.datasets_provider(worker_config, split="train")
         energon_dataloader = get_savable_loader(train_dataset, worker_config=worker_config)
         self.train_dataloader_object = energon_dataloader
@@ -204,27 +227,7 @@ def val_dataloader(self):
         """
         if self.val_dataloader_object:
             return self.val_dataloader_object
-
-        if not parallel_state.is_initialized():
-            logger.info(
-                f"Muiltimodal val data loader parallel state is not initialized,"
-                f"using default worker config with no_workers {self.num_workers}"
-            )
-            worker_config = WorkerConfig.default_worker_config(self.num_val_workers)
-        else:
-            rank = parallel_state.get_data_parallel_rank()
-            world_size = parallel_state.get_data_parallel_world_size()
-            data_parallel_group = parallel_state.get_data_parallel_group()
-
-            logger.info(f"rank {rank} world_size {world_size} data_parallel_group {data_parallel_group}")
-            worker_config = WorkerConfig(
-                rank=rank,
-                world_size=world_size,
-                num_workers=self.num_workers,
-                data_parallel_group=data_parallel_group,
-                worker_debug_path=None,
-                worker_log_level=0,
-            )
+        worker_config = self._build_worker_config(self.num_val_workers, split="val")
         val_dataset = self.datasets_provider(worker_config, split="val")
         energon_loader = get_savable_loader(val_dataset, worker_config=worker_config)
         self.val_dataloader_object = energon_loader
diff --git a/src/megatron/bridge/data/energon/energon_provider.py b/src/megatron/bridge/data/energon/energon_provider.py
index c128fc5c89..ff7cbdd22b 100644
--- a/src/megatron/bridge/data/energon/energon_provider.py
+++ b/src/megatron/bridge/data/energon/energon_provider.py
@@ -33,6 +33,8 @@ class EnergonProvider(DatasetProvider):
     num_workers: int
     dataloader_type: str = "external"
     task_encoder: Optional[Any] = None
+    # Enable batch-level online sequence packing
+    pack_sequences_in_batch: bool = False
 
     def build_datasets(self, context: DatasetBuildContext):
         dataset = EnergonMultiModalDataModule(
@@ -44,6 +46,7 @@
             micro_batch_size=self.micro_batch_size,
             global_batch_size=self.global_batch_size,
             num_workers=self.num_workers,
+            pg_collection=context.pg_collection,
         )
         return (
             iter(dataset.train_dataloader()),
diff --git a/src/megatron/bridge/data/utils.py b/src/megatron/bridge/data/utils.py
index 0c258c1de9..a5695f03cc 100644
--- a/src/megatron/bridge/data/utils.py
+++ b/src/megatron/bridge/data/utils.py
@@ -189,6 +189,7 @@ def protocol_adapter(
     train_val_test_num_samples: list[int],
     config: DatasetProvider,
     tokenizer: Optional[MegatronTokenizer] = None,
+    pg_collection: Optional[ProcessGroupCollection] = None,
 ) -> tuple[Optional[Any], Optional[Any], Optional[Any]]:
     """Adapter function that bridges the protocol interface with the legacy interface."""
     context = DatasetBuildContext(
@@ -196,6 +197,7 @@
         valid_samples=train_val_test_num_samples[1],
         test_samples=train_val_test_num_samples[2],
         tokenizer=tokenizer,
+        pg_collection=pg_collection,
     )
     return config.build_datasets(context)
diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py
index 05c163a24a..b6301ef5c5 100644
--- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py
+++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py
@@ -19,14 +19,16 @@
 import re
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import numpy as np
 import torch
 from megatron.energon import Batch, DefaultTaskEncoder
+from megatron.energon.epathlib.epath import EPath
 from megatron.energon.flavors.base_dataset import Sample
-from megatron.energon.task_encoder.cooking import Cooker, basic_sample_keys
-from PIL import Image
+from megatron.energon.flavors.webdataset import DefaultDecoderWebdatasetFactory
+from transformers import BatchEncoding
+from webdataset.autodecode import Decoder, imagehandler
 
 from megatron.bridge.training.utils.visual_inputs import Qwen2_5_VLVisualInputs
 
@@ -114,14 +116,14 @@ def process_vision(
             kwargs["min_pixels"] = min_pixels
         if max_pixels is not None:
             kwargs["max_pixels"] = max_pixels
-        image_inputs = processor(images=images, videos=None, return_tensors="pt", **kwargs)
+        image_inputs = processor(images=images, text="", videos=None, return_tensors="pt", **kwargs)
         image_grid_thw = image_inputs.get("image_grid_thw", None)
     else:
         image_inputs = {}
         image_grid_thw = None
 
     if videos is not None:
-        videos_inputs = processor(images=None, videos=videos, return_tensors="pt")
+        videos_inputs = processor(images=None, text="", videos=videos, return_tensors="pt")
         video_grid_thw = videos_inputs.get("video_grid_thw", None)
     else:
         videos_inputs = {}
@@ -152,15 +154,100 @@ def _get(token_str: str, default_id: int) -> int:
     return image_id, video_id
 
 
+def _tensor_to_pil(t):
+    """Convert a [C,H,W] float tensor in [0,1] to a PIL Image (uint8 [0,255])."""
+    from PIL import Image
+
+    img_np = (t.permute(1, 2, 0).numpy() * 255).clip(0, 255).astype(np.uint8)
+    return Image.fromarray(img_np)
+
+
+def _images_to_pil(imgs):
+    """Convert WDS tensor images to PIL to match HF flow input format.
+
+    WDS imagehandler decodes JPEG to float tensors in [0,1]. The HF flow passes
+    PIL images (uint8 [0,255]) to the processor. Converting to PIL here ensures
+    the processor applies identical rescaling and normalization in both flows.
+    """
+    if isinstance(imgs, torch.Tensor):
+        if imgs.dim() == 3:
+            return [_tensor_to_pil(imgs)]
+        elif imgs.dim() == 4:
+            return [_tensor_to_pil(img) for img in imgs]
+    elif isinstance(imgs, list):
+        return [_tensor_to_pil(img) if isinstance(img, torch.Tensor) else img for img in imgs]
+    return imgs
+
+
+def _videos_to_pil(videos):
+    """Convert WDS video frame tensors to PIL to match HF flow input format."""
+    if videos is None:
+        return None
+    result = []
+    for video in videos:
+        if isinstance(video, list):
+            result.append([_tensor_to_pil(f) if isinstance(f, torch.Tensor) else f for f in video])
+        elif isinstance(video, torch.Tensor):
+            if video.dim() == 4:
+                result.append([_tensor_to_pil(f) for f in video])
+            elif video.dim() == 3:
+                result.append([_tensor_to_pil(video)])
+            else:
+                result.append([video])
+        else:
+            result.append(video)
+    return result
+
+
 @dataclass
 class ChatMLSample(Sample):
-    """Intermediate Sample Format"""
+    """Multi-turn conversation sample with optional images and videos."""
 
-    # __key__: str
-    # __subflavors__: Dict
-    imgs: List[Image.Image]
-    videos: List[torch.Tensor | list[Image.Image]]
     conversation: str  # JSON string of GPT-format conversations
+    imgs: Optional[List[torch.Tensor]] = None
+    videos: Optional[List[List[torch.Tensor]]] = None
+
+
+class videohandler:
+    """Create a video handler."""
+
+    def __init__(self, imagespec):
+        self.extensions = ["jpgs", "mp4s", "videos"]
+        self.extensions_mapping = {"jpgs": "jpg", "mp4s": "jpg", "videos": "jpg"}
+        self.image_handler = imagehandler(imagespec)
+
+    def __call__(self, key, data):
+        """Perform nested image decoding."""
+        extension = re.sub(r".*[.]", "", key)
+        if extension.lower() not in self.extensions:
+            return None
+        data = pickle.loads(data)
+        key = self.extensions_mapping[extension]
+        if extension.lower() == "jpgs":
+            data = [self.image_handler(key, d) for d in data]
+        else:
+            data = [[self.image_handler(key, d) for d in video] for video in data]
+        return data
+
+
+class ChatMLWebdataset(DefaultDecoderWebdatasetFactory[ChatMLSample]):
+    """Webdataset factory for multi-turn ChatML samples with multimodal support.
+
+    Extends DefaultDecoderWebdatasetFactory to decode webdataset shards into
+    ChatMLSample instances, using custom handlers for image and video fields.
+    """
+
+    __sample_type__ = ChatMLSample
+
+    def __init__(self, path: EPath, *, auto_decode: bool = True, **kwargs):
+        super().__init__(path, auto_decode=auto_decode, **kwargs)
+        if auto_decode:
+            self._decoder = Decoder(
+                [
+                    imagehandler(self.image_decode),
+                    videohandler(self.image_decode),
+                ]
+            )
 
 
 @dataclass
@@ -231,51 +318,9 @@ def convert_to_qwenvl_content(user_input: str, image_pattern: str = "<image>", v
     return contents
 
 
-def cook_chatml_sample(sample: dict) -> ChatMLSample:
-    """
-    Convert crude sampel to ChatMLSample.
-
-    Args:
-        sample: Crude sample in pickle serialized format
-
-    Returns:
-        sample in ChatMLSample format
-    """
-    imgs = sample.get("jpgs", None)
-    if imgs:
-        imgs = pickle.loads(imgs)
-        if isinstance(imgs, list) and len(imgs) > 0:
-            imgs = [Image.fromarray(d) for d in imgs]
-        else:
-            imgs = None
-    videos = sample.get("videos", None)
-    if videos:
-        videos = pickle.loads(videos)
-        if isinstance(videos, list) and len(videos) > 0:
-            videos = [[d for d in video] for video in videos]
-        else:
-            videos = None
-    if "<image>" in sample["json"] and imgs is None:
-        logging.warning("<image> in conversation text but no image data")
-    if "