diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py
index 34752c878b1d3..0a99b1a1baad5 100644
--- a/nemo/collections/multimodal/data/energon/base.py
+++ b/nemo/collections/multimodal/data/energon/base.py
@@ -11,23 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import TYPE_CHECKING, Any, Dict, Literal, Optional
 
+from copy import deepcopy
+from typing import Any, Dict, Literal, Optional
+
+import fiddle as fdl
 import pytorch_lightning as pl
 from megatron.core import parallel_state
 from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils.data import DataLoader
+from typing_extensions import Self
 
 from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig
 from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder
-from nemo.lightning.io.mixin import IOMixin
+from nemo.lightning.io.mixin import IOMixin, serialization, track_io
 from nemo.lightning.pytorch.plugins import MegatronDataSampler
 from nemo.utils import logging
 
-if TYPE_CHECKING:
-    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
-
 
 class SimpleMultiModalDataModule(pl.LightningDataModule, IOMixin):
     """
@@ -66,6 +67,7 @@ def __init__(
         pin_memory: bool = True,
         multimodal_sample_config: Optional[MultiModalSampleConfig] = MultiModalSampleConfig(),
         task_encoder: Optional[MultiModalTaskEncoder] = None,
+        decoder_seq_length: Optional[int] = None,
     ) -> None:
         """
         Initialize the SimpleMultiModalDataModule.
@@ -87,6 +89,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.image_processor = image_processor
         self.seq_length = seq_length
+        self.decoder_seq_length = decoder_seq_length
         self.micro_batch_size = micro_batch_size
         self.global_batch_size = global_batch_size
         self.num_workers = num_workers
@@ -99,11 +102,24 @@ def __init__(
         )
         self.init_global_step = 0
         self.data_sampler = SequentialMegatronSampler(
-            seq_len=self.seq_length, micro_batch_size=self.micro_batch_size, global_batch_size=self.global_batch_size
+            seq_len=self.seq_length,
+            decoder_seq_len=self.decoder_seq_length,
+            micro_batch_size=self.micro_batch_size,
+            global_batch_size=self.global_batch_size,
         )
         self.train_dataloader_object = None
         self.val_dataloader_object = None
 
+    def io_init(self, **kwargs) -> fdl.Config[Self]:
+        # (pleasefixme) image_processor and task_encoder are problematic with Fiddle so we skip serializing them for now
+        cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items() if k not in ['image_processor', 'task_encoder']}
+
+        for val in cfg_kwargs.values():
+            if not serialization.find_node_traverser(type(val)):
+                track_io(type(val))
+        cfg = fdl.Config(type(self), **cfg_kwargs)
+        return cfg
+
     def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val'):
         """
         Provide the dataset for training or validation.
@@ -315,6 +331,7 @@ def __init__(
         micro_batch_size: int = 4,
         global_batch_size: int = 8,
         init_consumed_samples: int = 0,
+        decoder_seq_len: Optional[int] = None,
         init_global_step=0,
     ):
         """
@@ -328,6 +345,7 @@ def __init__(
         """
         super().__init__(
             seq_len=seq_len,
+            decoder_seq_len=decoder_seq_len,
             micro_batch_size=micro_batch_size,
             global_batch_size=global_batch_size,
             init_consumed_samples=init_consumed_samples,
diff --git a/nemo/collections/multimodal/data/energon/config.py b/nemo/collections/multimodal/data/energon/config.py
index 45ca8e9db8006..c145c5e510198 100644
--- a/nemo/collections/multimodal/data/energon/config.py
+++ b/nemo/collections/multimodal/data/energon/config.py
@@ -15,7 +15,7 @@
 from dataclasses import dataclass, field
 from typing import List
 import torch
-from nemo.collections.multimodal.data.energon.conversation import BaseConversationTemplateConfig
+from nemo.collections.multimodal.data.energon.conversation import LLaVATemplateConfig
 
 
 @dataclass
@@ -56,12 +56,6 @@ class ImageTextRawBatch:
     loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
 
 
-class LLaVATemplateConfig(BaseConversationTemplateConfig):
-    """LLava specific template configuration which extends the base config"""
-
-    pass
-
-
 @dataclass
 class MultiModalSampleConfig:
     image_token: ImageToken = field(default_factory=ImageToken)
diff --git a/nemo/collections/multimodal/data/energon/conversation.py b/nemo/collections/multimodal/data/energon/conversation.py
index 3342b7e9a411a..f0749e47dc12e 100644
--- a/nemo/collections/multimodal/data/energon/conversation.py
+++ b/nemo/collections/multimodal/data/energon/conversation.py
@@ -19,6 +19,15 @@
 class BaseConversationTemplateConfig:
     """Conversation template config related parameters"""
 
+    system: Optional[str] = "".format()  # fmt: off
+    roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
+    stop_string: Optional[str] = None
+    chat_template = None
+
+
+class LLaVATemplateConfig(BaseConversationTemplateConfig):
+    """LLava specific template configuration which extends the base config"""
+
     system: Optional[str] = (
         "A chat between a curious user and artificial assistant agent. The assistant gives helpful, detailed and polite answers to user's questions.".format()
     )  # fmt: off
@@ -36,3 +45,14 @@ class BaseConversationTemplateConfig:
         {%- endif %}
     {%- endfor -%}
     """
+
+
+class MLlamaTemplateConfig(BaseConversationTemplateConfig):
+    """LLava specific template configuration which extends the base config"""
+
+    system: Optional[str] = None
+    roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
+    stop_string: str = None
+    chat_template = """
+    '{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- if strftime_now is defined %}\n        {%- set date_string = strftime_now("%d %b %Y") %}\n    {%- else %}\n        {%- set date_string = "26 Jul 2024" %}\n    {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n    {%- set system_message = messages[0][\'content\']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = "" %}\n{%- endif %}\n\n{#- Find out if there are any images #}\n{% set image_ns = namespace(has_images=false) %}      \n{%- for message in messages %}\n    {%- for content in message[\'content\'] %}\n        {%- if content[\'type\'] == \'image\' %}\n            {%- set image_ns.has_images = true %}\n        {%- endif %}\n    {%- endfor %}\n{%- endfor %}\n\n{#- Error out if there are images and system message #}\n{%- if image_ns.has_images and not system_message == "" %}\n    {{- raise_exception("Prompting with images is incompatible with system messages.") }}\n{%- endif %}\n\n{#- System message if there are no images #}\n{%- if not image_ns.has_images %}\n    {{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n    {%- if tools is not none %}\n        {{- "Environment: ipython\\n" }}\n    {%- endif %}\n    {{- "Cutting Knowledge Date: December 2023\\n" }}\n    {{- "Today Date: " + date_string + "\\n\\n" }}\n    {%- if tools is not none and not tools_in_user_message %}\n        {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}\n        {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n        {{- "Do not use variables.\\n\\n" }}\n        {%- for t in tools %}\n            {{- t | tojson(indent=4) }}\n            {{- "\\n\\n" }}\n        {%- endfor %}\n    {%- endif %}\n    {{- system_message }}\n    {{- "<|eot_id|>" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0][\'content\']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n    {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n    {{- "Given the following functions, please respond with a JSON for a function call " }}\n    {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n    {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n    {{- "Do not use variables.\\n\\n" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- "\\n\\n" }}\n    {%- endfor %}\n    {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n    {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\' }}\n        {%- if message[\'content\'] is string %}\n            {{- message[\'content\'] }}\n        {%- else %}\n            {%- for content in message[\'content\'] %}\n                {%- if content[\'type\'] == \'image\' %}\n                    {{- \'<|image|>\' }}\n                {%- elif content[\'type\'] == \'text\' %}\n                    {{- content[\'text\'] }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- \'<|eot_id|>\' }}\n    {%- elif \'tool_calls\' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception("This model only supports single tool-calls at once!") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n        {{- \'{"name": "\' + tool_call.name + \'", \' }}\n        {{- \'"parameters": \' }}\n        {{- tool_call.arguments | tojson }}\n        {{- "}" }}\n        {{- "<|eot_id|>" }}\n    {%- elif message.role == "tool" or message.role == "ipython" %}\n        {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- "<|eot_id|>" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n'
+    """
diff --git a/nemo/collections/multimodal/data/energon/task_encoder.py b/nemo/collections/multimodal/data/energon/task_encoder.py
index 5989ecad879be..23758b3a43dbf 100644
--- a/nemo/collections/multimodal/data/energon/task_encoder.py
+++ b/nemo/collections/multimodal/data/energon/task_encoder.py
@@ -62,7 +62,7 @@ def __init__(self, tokenizer, image_processor, multimodal_sample_config):
         image_processor (ImageProcessor): The image processor used for preprocessing images across different sample types.
         multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples, including tokens and placeholders.
         """
-
+        self.tokenizer = tokenizer
         self.encoders: Dict[str, SampleEncoder] = {
             VQASample.__name__: VQASampleEncoder(
                 tokenizer=tokenizer,
diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py
index 2aeeae299a7d0..7d8cc2c942477 100644
--- a/nemo/collections/vlm/__init__.py
+++ b/nemo/collections/vlm/__init__.py
@@ -1,28 +1,56 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule
+from nemo.collections.vlm.mllama.model.base import (
+    CrossAttentionTextConfig,
+    CrossAttentionVisionConfig,
+    MLlamaModel,
+    MLlamaModelConfig,
+)
+from nemo.collections.vlm.mllama.model.mllama import (
+    MLlamaConfig11B,
+    MLlamaConfig11BInstruct,
+    MLlamaConfig90B,
+    MLlamaConfig90BInstruct,
+)
 from nemo.collections.vlm.neva.data import (
     DataConfig,
     ImageDataConfig,
     ImageToken,
-    MockDataModule,
     MultiModalToken,
     NevaLazyDataModule,
+    NevaMockDataModule,
     VideoDataConfig,
     VideoToken,
 )
-from nemo.collections.vlm.neva.model import (
+from nemo.collections.vlm.neva.model.base import (
     CLIPViTConfig,
     HFCLIPVisionConfig,
-    Llava1_5Config7B,
-    Llava1_5Config13B,
-    LlavaConfig,
-    LlavaModel,
     MultimodalProjectorConfig,
     NevaConfig,
     NevaModel,
 )
+from nemo.collections.vlm.neva.model.llava import Llava1_5Config7B, Llava1_5Config13B, LlavaConfig, LlavaModel
+from nemo.collections.vlm.peft import LoRA
+from nemo.collections.vlm.recipes import *
 
 __all__ = [
-    "MockDataModule",
+    "NevaMockDataModule",
     "NevaLazyDataModule",
+    "MLlamaMockDataModule",
+    "MLlamaLazyDataModule",
     "DataConfig",
     "ImageDataConfig",
     "VideoDataConfig",
@@ -38,4 +66,14 @@
     "Llava1_5Config7B",
     "Llava1_5Config13B",
     "LlavaModel",
+    "MLlamaModel",
+    "MLlamaModelConfig",
+    "CrossAttentionTextConfig",
+    "CrossAttentionVisionConfig",
+    "MLlamaConfig11B",
+    "MLlamaConfig11BInstruct",
+    "MLlamaConfig90B",
+    "MLlamaConfig90BInstruct",
+    "mllama_11b",
+    "mllama_90b",
 ]
diff --git a/nemo/collections/vlm/mllama/__init__.py b/nemo/collections/vlm/mllama/__init__.py
new file mode 100644
index 0000000000000..94a1021ca0f8e
--- /dev/null
+++ b/nemo/collections/vlm/mllama/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers import PreTrainedTokenizerFast
+from nemo.lightning.io import track_io
+
+track_io(PreTrainedTokenizerFast)
diff --git a/nemo/collections/vlm/mllama/data/__init__.py b/nemo/collections/vlm/mllama/data/__init__.py
new file mode 100644
index 0000000000000..0e89762a4c9ab
--- /dev/null
+++ b/nemo/collections/vlm/mllama/data/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule
+from nemo.collections.vlm.mllama.data.mock import MockDataModule as MLlamaMockDataModule
+
+__all__ = [
+    "MLlamaMockDataModule",
+    "MLlamaLazyDataModule",
+]
diff --git a/nemo/collections/vlm/mllama/data/lazy.py b/nemo/collections/vlm/mllama/data/lazy.py
new file mode 100644
index 0000000000000..30b8b2ea9d9c2
--- /dev/null
+++ b/nemo/collections/vlm/mllama/data/lazy.py
@@ -0,0 +1,308 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import re
+from typing import Any, Dict, List, Optional, Sequence
+
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+from torch.utils import data
+from torch.utils.data import DataLoader, default_collate
+
+from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
+from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor
+from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig
+from nemo.collections.vlm.neva.data.lazy import IGNORE_INDEX, LazySupervisedDataset
+from nemo.lightning.pytorch.plugins import MegatronDataSampler
+
+
+class MLlamaDataset(LazySupervisedDataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(
+        self,
+        data_path,
+        data_config,
+        tokenizer,
+        image_processor,
+        sequence_length,
+    ):
+
+        if data_path.endswith(".json"):
+            super().__init__(data_path, data_config, tokenizer, image_processor, sequence_length)
+
+        elif data_path.endswith(".jsonl"):
+            super().__init__(None, data_config, tokenizer, image_processor, sequence_length)
+            logging.warning("Loading image inputs from SteerLM Dataset...")
+            if data_config.media_type == 'image':
+                image_folder = data_config.image_folder
+                for line in open(data_path, "r"):
+                    record = json.loads(line)
+
+                    # This currently supports only a single image
+                    # search for <img src="/absolute/path/to/image" in the conversation
+                    #   add it as record['image'], remove src tag from the <img> tag
+
+                    record['image'] = []
+                    for turn in record['conversations']:
+                        matches = re.finditer(r'<img src=["\']([^"\']+)["\']', turn['value'])
+                        for match in matches:
+                            image_name = match.group(1).split("/")[-1]
+                            image_path = os.path.join(image_folder, image_name)
+                            if not os.path.isfile(image_path):
+                                logging.warning(f"Image not found: {image_path}")
+                                continue
+                            record['image'].append(image_name)  # url
+                        turn['value'] = re.sub('<img src=["\']([^"\']+)["\']', "<image>", turn['value'])
+
+                    self.list_data_dict.append(record)
+
+        else:
+            raise ValueError(f"Formatting of {data_path} is not supported in MLlama.")
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        source = self.list_data_dict[i]
+        conversations = self._apply_prompt_templates(source, use_plain=self.conv_template == "plain")
+        conversations = conversations.replace("<image>", "<|image|>")
+        tokens, labels = self._tokenize_and_label(conversations)
+
+        image_dict = self._process_images(source)
+        data_dict = dict(
+            **image_dict,
+            tokens=tokens,
+            labels=labels,
+        )
+        return data_dict
+
+    def _process_images(self, source):
+        images = []
+        if 'image' in source:
+            if not isinstance(source['image'], list):
+                source['image'] = [source['image']]
+            for image_file in source['image']:
+                image = self.image_loader.open_image(image_file)
+                if image is None:
+                    logging.warning(f"Image {image_file} could not be found!")
+                images.append(image)
+
+        if len(images) > 0:
+            image_dict = self.image_processor.preprocess(images, return_tensors='pt')
+            image_dict = {
+                k: v[0] for k, v in image_dict.items() if k in ["pixel_values", "aspect_ratio_ids", "num_tiles"]
+            }  # remove batch dim
+        else:
+            image_dict = dict(
+                pixel_values=torch.zeros(
+                    1, 4, 3, self.image_processor.size['height'], self.image_processor.size['width']
+                ),
+                aspect_ratio_ids=torch.tensor([0], dtype=torch.long),
+                num_tiles=[0],
+            )
+
+        return image_dict
+
+    def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        data_config = self.data_config
+        max_len = (max(instance['tokens'].shape[0] for instance in instances) - 1) // 64 * 64 + 64
+        if max_len > self.sequence_length:
+            logging.warning(f"Truncating sequence length {max_len} to {self.seq_length}.")
+            max_len = self.sequence_length
+        max_num_concurrent_media = max(instance['pixel_values'].shape[0] for instance in instances)
+        for instance in instances:
+            pad_len = max_len - instance['tokens'].shape[0]
+            instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0)
+            instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX)
+            pad_num_images = max_num_concurrent_media - instance['pixel_values'].shape[0]
+            instance['pixel_values'] = F.pad(
+                instance['pixel_values'], (0, 0, 0, 0, 0, 0, 0, 0, 0, pad_num_images), 'constant', 0
+            )
+            instance['aspect_ratio_ids'] = F.pad(
+                instance['aspect_ratio_ids'], (0, max(pad_num_images - 1, 0)), 'constant', 0
+            )
+            instance['num_tiles'] = F.pad(
+                torch.tensor(instance['num_tiles']), (0, max(pad_num_images - 1, 0)), 'constant', 0
+            )
+
+        batch_masks = [create_vision_mask_tensor(instance['tokens'], 128256) for instance in instances]
+        batch = default_collate(instances)
+
+        tokenizer = self.tokenizer
+
+        tokens = batch['tokens']
+        labels = batch['labels']
+
+        attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+            data=tokens,
+            eod_token=tokenizer.eos_token_id,
+            eod_mask_loss=data_config.eod_mask_loss,
+            reset_attention_mask=data_config.reset_attention_mask,
+            reset_position_ids=data_config.reset_position_ids,
+        )
+
+        loss_mask[labels < 0] = 0.0
+        batch = {
+            'tokens': tokens,
+            'labels': labels,
+            'batch_images': batch['pixel_values'],
+            'batch_masks': batch_masks,
+            'num_chunks': batch['num_tiles'],
+            'attention_mask': attention_mask,
+            "aspect_ratio_ids": batch['aspect_ratio_ids'],
+            'loss_mask': loss_mask,
+            'position_ids': position_ids,
+        }
+        return batch
+
+
+class MLlamaLazyDataModule(pl.LightningDataModule):
+    def __init__(
+        self,
+        paths: str | List[str],
+        weights: Optional[List[float]] = None,
+        data_config: Optional[DataConfig] = ImageDataConfig,
+        seq_length: int = 2048,
+        decoder_seq_length: Optional[int] = None,
+        tokenizer: Optional = None,
+        image_processor: Optional = None,
+        micro_batch_size: int = 4,
+        global_batch_size: int = 8,
+        num_train_samples: int = 10_000,
+        num_val_samples: int = 10_000,
+        num_test_samples: int = 10_000,
+        num_workers: int = 8,
+        pin_memory: bool = True,
+        persistent_workers: bool = False,
+        use_packed_sequence: bool = False,
+        seed: int = 1234,
+    ) -> None:
+        super().__init__()
+        if not isinstance(paths, (list, tuple)):
+            paths = [paths]
+        if weights is not None:
+            assert len(weights) == len(paths)
+            if len(weights) == 1:
+                # weights must be None if there is only one dataset
+                weights = None
+
+        self.paths = paths
+        self.weights = weights
+        self.data_config = data_config
+        self.seq_length = seq_length
+        self.decoder_seq_length = decoder_seq_length
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.num_train_samples = num_train_samples
+        self.num_val_samples = num_val_samples
+        self.num_test_samples = num_test_samples
+        self.num_workers = num_workers
+        self.pin_memory = pin_memory
+        self.persistent_workers = persistent_workers
+        self.seed = seed
+        self.use_packed_sequence = use_packed_sequence
+        self.init_global_step = 0
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+
+        if tokenizer is None or image_processor is None:
+            logging.warning(
+                f"Processor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`."
+            )
+            from transformers import AutoProcessor
+
+            processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
+            self.tokenizer = tokenizer or processor.tokenizer
+            self.image_processor = image_processor or processor.image_processor
+
+        self.data_sampler = MegatronDataSampler(
+            seq_len=self.seq_length,
+            decoder_seq_len=self.decoder_seq_length,
+            micro_batch_size=micro_batch_size,
+            global_batch_size=global_batch_size,
+            dataloader_type="cyclic",
+        )
+
+    def setup(self, stage: str = "") -> None:
+        assert len(self.paths) == 1, "not yet support blend dataset in MLlama 2.0!"
+        if self.use_packed_sequence:
+            pass  # TODO
+        else:
+            # TODO:
+            # rng = torch.Generator().manual_seed(self.seed)
+            # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng)
+            self._train_ds = MLlamaDataset(
+                self.paths[0], self.data_config, self.tokenizer, self.image_processor, self.seq_length
+            )
+            self._validation_ds = MLlamaDataset(
+                self.paths[0], self.data_config, self.tokenizer, self.image_processor, self.seq_length
+            )
+
+    def train_dataloader(self) -> TRAIN_DATALOADERS:
+        return self._create_dataloader(self._train_ds)
+
+    def val_dataloader(self) -> EVAL_DATALOADERS:
+        return self._create_dataloader(self._validation_ds)
+
+    def test_dataloader(self) -> EVAL_DATALOADERS:
+        return self._create_dataloader(self._test_ds)
+
+    def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
+        self.init_global_step = self.trainer.global_step
+        self.data_sampler.init_global_step = self.init_global_step
+        return DataLoader(
+            dataset,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            persistent_workers=self.persistent_workers,
+            collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate),
+            **kwargs,
+        )
+
+    def state_dict(self) -> Dict[str, Any]:
+        """Called when saving a checkpoint, implement to generate and save datamodule state.
+
+        Returns:
+            A dictionary containing datamodule state.
+
+        """
+        consumed_samples = self.data_sampler.compute_consumed_samples(self.trainer.global_step - self.init_global_step)
+        return {'consumed_samples': consumed_samples}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat
+
+        Args:
+            state_dict: the datamodule state returned by ``state_dict``.
+
+        """
+        try:
+            from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR
+        except ModuleNotFoundError:
+            from nemo.lightning.apex_utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR
+        consumed_samples = state_dict['consumed_samples']
+        self.data_sampler.init_consumed_samples = consumed_samples
+        self.data_sampler.prev_consumed_samples = consumed_samples
+        self.if_first_step = 1
+
+        if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None:
+            num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR  # noqa: SLF001
+
+            num_microbatch_calculator.update(
+                consumed_samples=consumed_samples,
+                consistency_check=False,
+            )
diff --git a/nemo/collections/vlm/mllama/data/mock.py b/nemo/collections/vlm/mllama/data/mock.py
new file mode 100644
index 0000000000000..bb3afe83ea46d
--- /dev/null
+++ b/nemo/collections/vlm/mllama/data/mock.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+from torch.utils import data
+from torch.utils.data import DataLoader, Dataset
+
+from nemo.lightning.pytorch.plugins import MegatronDataSampler
+
+
+class MockDataModule(pl.LightningDataModule):
+    def __init__(
+        self,
+        seq_length: int = 2048,
+        decoder_seq_length: Optional = None,
+        vocab_size: int = 128256,
+        crop_size: Tuple[int, int] = (560, 560),
+        micro_batch_size: int = 4,
+        global_batch_size: int = 8,
+        rampup_batch_size: Optional[List[int]] = None,
+        num_train_samples: int = 10_000,
+        num_val_samples: int = 10_000,
+        num_test_samples: int = 10_000,
+        num_workers: int = 8,
+        pin_memory: bool = True,
+        persistent_workers: bool = False,
+    ):
+        super().__init__()
+        self.seq_length = seq_length
+        self.decoder_seq_length = decoder_seq_length
+        self.num_train_samples = num_train_samples
+        self.num_val_samples = num_val_samples
+        self.num_test_samples = num_test_samples
+        self.num_workers = num_workers
+        self.pin_memory = pin_memory
+        self.persistent_workers = persistent_workers
+        self.vocab_size = vocab_size
+        self.crop_size = crop_size
+
+        self.data_sampler = MegatronDataSampler(
+            seq_len=self.seq_length,
+            decoder_seq_len=self.decoder_seq_length,
+            micro_batch_size=micro_batch_size,
+            global_batch_size=global_batch_size,
+            rampup_batch_size=rampup_batch_size,
+        )
+
+    def setup(self, stage: str = "") -> None:
+        self._train_ds = _MockMLlamaDataset(
+            self.vocab_size, self.crop_size, "train", self.num_train_samples, self.decoder_seq_length
+        )
+        self._validation_ds = _MockMLlamaDataset(
+            self.vocab_size, self.crop_size, "valid", self.num_val_samples, self.decoder_seq_length
+        )
+        self._test_ds = _MockMLlamaDataset(
+            self.vocab_size, self.crop_size, "test", self.num_test_samples, self.decoder_seq_length
+        )
+
+    def train_dataloader(self) -> TRAIN_DATALOADERS:
+        if not hasattr(self, "_train_ds"):
+            self.setup()
+        return self._create_dataloader(self._train_ds)
+
+    def val_dataloader(self) -> EVAL_DATALOADERS:
+        if not hasattr(self, "_validation_ds"):
+            self.setup()
+        return self._create_dataloader(self._validation_ds)
+
+    def test_dataloader(self) -> EVAL_DATALOADERS:
+        if not hasattr(self, "_test_ds"):
+            self.setup()
+        return self._create_dataloader(self._test_ds)
+
+    def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
+        return DataLoader(
+            dataset,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            persistent_workers=self.persistent_workers,
+            collate_fn=dataset.collate_fn,
+            **kwargs,
+        )
+
+
+class _MockMLlamaDataset(Dataset):
+    def __init__(
+        self,
+        vocab_size,
+        crop_size,
+        name: str,
+        num_samples: int,
+        seq_length: int,
+        seed: int = 42,
+    ) -> None:
+        super().__init__()
+        self.name = name
+        self.seq_length = seq_length
+
+        self.vocab_size = vocab_size
+
+        self.image_height, self.image_width = crop_size
+
+        self.length = num_samples
+        self.seed = seed
+
+        self.loss_mask = torch.ones(self.seq_length, dtype=torch.float)
+        self.position_ids = torch.arange(self.seq_length, dtype=torch.int64)
+
+    def __len__(self) -> int:
+        return self.length
+
+    def _get_text(self, idx: int) -> np.ndarray:
+        np_gen = np.random.default_rng(seed=(self.seed + idx))
+        return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)
+
+    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
+        # Generate data of the expected size and datatype (based on GPTDataset).
+        np_gen = np.random.default_rng(seed=(self.seed + idx))
+        tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64))
+        images = torch.from_numpy(np_gen.standard_normal((1, 4, 3, self.image_height, self.image_width)))
+        aspect_ratio_ids = torch.from_numpy(np_gen.integers(8, size=[1], dtype=np.int64)) + 1
+
+        labels = tokens.clone()
+        tokens = tokens[:-1]
+        labels = labels[1:]
+
+        return {
+            "images": images,
+            "masks": [[5, 512]],
+            "num_chunks": [4],
+            "tokens": tokens,
+            "aspect_ratio_ids": aspect_ratio_ids,
+            "loss_mask": self.loss_mask,
+            "position_ids": self.position_ids,
+            "labels": labels,
+        }
+
+    def _collate_fn(self, batch):
+        """
+        A default implementation of a collation function.
+        Users should override this method to define custom data loaders.
+        """
+        collated_batch = {}
+        collated_batch["batch_masks"] = [sample.pop("masks") for sample in batch]
+        collated_batch["attention_mask"] = None
+        collated_batch.update(data.dataloader.default_collate(batch))
+        collated_batch["batch_images"] = collated_batch.pop("images")
+        return collated_batch
+
+    def collate_fn(self, batch):
+        """Method that user pass as functor to DataLoader.
+
+        The method optionally performs neural type checking and add types to the outputs.
+
+        Please note, subclasses of Dataset should not implement `input_types`.
+
+        # Usage:
+        dataloader = torch.utils.data.DataLoader(
+                ....,
+                collate_fn=dataset.collate_fn,
+                ....
+        )
+
+        Returns
+        -------
+            Collated batch, with or without types.
+        """
+        return self._collate_fn(batch)
diff --git a/nemo/collections/vlm/mllama/data/sample_encoder.py b/nemo/collections/vlm/mllama/data/sample_encoder.py
new file mode 100644
index 0000000000000..d7bfa08978c89
--- /dev/null
+++ b/nemo/collections/vlm/mllama/data/sample_encoder.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from dataclasses import field
+from typing import Dict
+
+import torch
+from megatron.energon import VQASample
+
+from nemo.collections.multimodal.data.energon.config import ImageTextSample, MultiModalSampleConfig
+from nemo.collections.multimodal.data.energon.sample_encoder import VQASampleEncoder
+from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor
+from nemo.utils import logging
+
+
+class LlamaImageTextSample(ImageTextSample):
+    vision_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+    aspect_ratio_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+    aspect_ratio_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+    num_tiles: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+
+
+class Llama3SampleEncoder(VQASampleEncoder):
+    def __init__(self, tokenizer, image_processor, multimodal_sample_config=MultiModalSampleConfig()):
+        """
+        Initialize the VQASampleEncoder.
+
+        Parameters:
+        tokenizer (Tokenizer): The HF tokenizer used for processing text.
+        image_processor (ImageProcessor): The HF image processor used for preprocessing images.
+        multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples.
+            Defaults to MultiModalSampleConfig().
+        """
+        super().__init__(tokenizer, image_processor, multimodal_sample_config)
+        self.conversation_template_config = multimodal_sample_config.conversation_template_config
+
+    def process_image(self, image) -> Dict[str, torch.Tensor]:
+        image_dict = self.image_processor.preprocess(image, return_tensors='pt', do_rescale=False)
+        return image_dict
+
+    def apply_prompt_template(self, input_text: VQASample, use_plain=False):
+        if self.conversation_template_config.chat_template:
+            self.tokenizer.chat_template = self.conversation_template_config.chat_template
+        elif self.tokenizer.chat_template is None:
+            raise ValueError(
+                "Both tokenizer and conversation template does not have chat template defined. Refer to "
+                "https://huggingface.co/docs/transformers/main/en/chat_templating "
+            )
+        logging.debug(f"apply_conversation_template context {input_text.context} answer {input_text.answers}")
+
+        messages = []
+        if self.conversation_template_config.system:
+            messages.append(
+                {'role': 'system', 'content': [{'type': 'text', 'text': self.conversation_template_config.system}]}
+            )
+
+        if isinstance(input_text.context, list) and isinstance(input_text.answers, list):
+            # Ensure both lists are the same length or adjust based on your specific needs
+            min_length = min(len(input_text.context), len(input_text.answers))
+            for i in range(min_length):
+                messages.append(
+                    {
+                        'role': self.conversation_template_config.roles[0],
+                        'content': [{'type': 'text', 'text': input_text.context[i]}],
+                    }
+                )
+                messages.append(
+                    {
+                        'role': self.conversation_template_config.roles[1],
+                        'content': [{'type': 'text', 'text': input_text.answers[i]}],
+                    }
+                )
+        elif isinstance(input_text.context, str) and isinstance(input_text.answers, str):
+            # Handle single context and answer as strings
+            messages.append(
+                {
+                    'role': self.conversation_template_config.roles[0],
+                    'content': [{'type': 'text', 'text': input_text.context}],
+                }
+            )
+            messages.append(
+                {
+                    'role': self.conversation_template_config.roles[1],
+                    'content': [{'type': 'text', 'text': input_text.answers}],
+                }
+            )
+        else:
+            raise ValueError(
+                f"VQA Sample context/answers should either be a List[str] or str. Other types not supported"
+            )
+
+        templated_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        logging.debug(f"apply prompt template templated_prompt {templated_prompt}")
+        return templated_prompt
+
+    def tokenize(self, prompt: str) -> torch.Tensor:
+        regex_pattern = '(' + '|'.join(re.escape(token) for token in [self.image_token.token_str]) + ')'
+        chunks = re.split(regex_pattern, prompt)
+        # Tokenize each chunk and replace special tokens with their indices
+        tokenized_chunks = []
+        for chunk in chunks:
+            if chunk == self.image_token.token_str:
+                tokenized_chunks.append(self.image_token.token_id)
+            elif len(chunk) > 0:
+                tokenized_chunks.extend(self.tokenizer(chunk, add_special_tokens=False).input_ids)
+
+        return torch.tensor(tokenized_chunks, dtype=torch.long)
+
+    def encode(self, input_sample: VQASample, output_sample: LlamaImageTextSample):
+        conversation_prompt = self.apply_prompt_template(input_sample)
+        logging.debug(f"[Energon] task encoder encode_sample conversation_prompt {conversation_prompt}")
+        # tokenize prompt
+        tokens = self.tokenize(conversation_prompt)
+        labels = self.compute_labels(tokens, input_sample)
+
+        tokens = tokens[:-1].contiguous()
+        labels = labels[1:].contiguous()
+        logging.debug(f"[Energon] task encoder encode_sample after tokenize prompt tokens {tokens}")
+        logging.debug(f"[Energon] task encoder encode_sample labels {labels}")
+        loss_mask = self.compute_loss_mask(labels)
+        vision_mask = create_vision_mask_tensor(tokens=tokens, vision_token_id=self.image_token.token_id)
+        processed_image_dict = self.process_image(input_sample.image)
+        output_sample.__key__ = input_sample.__key__
+        output_sample.images = processed_image_dict['pixel_values'][0]
+        output_sample.aspect_ratio_ids = processed_image_dict['aspect_ratio_ids'][0]
+        output_sample.aspect_ratio_mask = processed_image_dict['aspect_ratio_mask'][0]
+        output_sample.num_tiles = processed_image_dict['num_tiles'][0]
+        output_sample.tokens = tokens
+        output_sample.labels = labels
+        output_sample.loss_mask = loss_mask
+        output_sample.vision_mask = vision_mask
+        return output_sample
diff --git a/nemo/collections/vlm/mllama/data/task_encoder.py b/nemo/collections/vlm/mllama/data/task_encoder.py
new file mode 100644
index 0000000000000..a7dcd3c8fb2cd
--- /dev/null
+++ b/nemo/collections/vlm/mllama/data/task_encoder.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+import torch
+import torch.nn.functional as F
+from megatron.energon import VQASample, batch_list, batch_pad_stack
+from torch.nn.utils.rnn import pad_sequence
+
+from nemo.collections.multimodal.data.energon.sample_encoder import SampleEncoder
+from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder
+from nemo.collections.vlm.mllama.data.sample_encoder import Llama3SampleEncoder, LlamaImageTextSample
+
+
+def pad_or_truncate(sequence_batch, seq_length: int, padding_value: int):
+    # Pad the sequence if it's shorter than seq_length
+    if sequence_batch.size(1) < seq_length:
+        pad_size = seq_length - sequence_batch.size(1)
+        sequence_batch = F.pad(sequence_batch, (0, pad_size), value=padding_value)
+    else:
+        # Truncate the sequence if it's longer than seq_length
+        sequence_batch = sequence_batch[:, :seq_length]
+
+    return sequence_batch
+
+
+@dataclass
+class LlamaImageTextRawBatch:
+    __keys__: List[str] = field(default_factory=list)
+
+    tokens: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.long))
+    labels: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.long))
+    loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+
+    batch_images: torch.Tensor = field(default_factory=lambda: torch.empty(0))
+    batch_masks: torch.Tensor = field(default_factory=lambda: torch.empty(0))
+
+    aspect_ratio_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+    aspect_ratio_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+    num_chunks: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
+
+
+class LlamaTaskEncoder(MultiModalTaskEncoder):
+    def __init__(self, tokenizer, image_processor, multimodal_sample_config, seq_length=None):
+        super().__init__(tokenizer, image_processor, multimodal_sample_config)
+        self.encoders: Dict[str, SampleEncoder] = {
+            VQASample.__name__: Llama3SampleEncoder(tokenizer, image_processor, multimodal_sample_config)
+        }
+        self.seq_length = seq_length
+        self.ignore_index = multimodal_sample_config.ignore_place_holder
+
+    def batch(self, samples: List[LlamaImageTextSample]) -> LlamaImageTextRawBatch:
+
+        keys, images, tokens, labels, loss_mask, vision_mask = [], [], [], [], [], []
+        aspect_ratio_ids, aspect_ratio_mask, num_tiles = [], [], []
+        for sample in samples:
+            keys.append(sample.__key__)
+            images.append(sample.images)
+            tokens.append(sample.tokens)
+            labels.append(sample.labels)
+            loss_mask.append(sample.loss_mask)
+            vision_mask.append(sample.vision_mask)
+            aspect_ratio_ids.append(sample.aspect_ratio_ids)
+            aspect_ratio_mask.append(sample.aspect_ratio_mask)
+            num_tiles.append(sample.num_tiles)
+
+        batch_keys = batch_list(keys)
+        batch_images = batch_pad_stack(images)
+
+        batch_tokens = pad_sequence(tokens, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        batch_labels = pad_sequence(labels, batch_first=True, padding_value=self.ignore_index)
+        batch_loss_mask = batch_pad_stack(loss_mask)
+        if self.seq_length is not None:
+            seq_length = self.seq_length
+        else:
+            seq_length = (batch_tokens.size(1) - 1) // 64 * 64 + 64
+        batch_tokens = pad_or_truncate(batch_tokens, seq_length, self.tokenizer.pad_token_id)
+        batch_labels = pad_or_truncate(batch_labels, seq_length, self.ignore_index)
+        batch_loss_mask = pad_or_truncate(batch_loss_mask, seq_length, 0)
+        assert batch_loss_mask.sum() > 0, "This batch has nothing to predict! Will trigger a nan loss."
+        batch_vision_mask = batch_pad_stack(vision_mask)
+        batch_aspect_ratio_ids = batch_pad_stack(aspect_ratio_ids)
+        batch_aspect_ratio_mask = batch_pad_stack(aspect_ratio_mask)
+        batch_num_tiles = torch.tensor(num_tiles)
+        return LlamaImageTextRawBatch(
+            __keys__=batch_keys,
+            batch_images=batch_images,
+            batch_masks=batch_vision_mask,
+            tokens=batch_tokens,
+            labels=batch_labels,
+            loss_mask=batch_loss_mask,
+            aspect_ratio_ids=batch_aspect_ratio_ids,
+            aspect_ratio_mask=batch_aspect_ratio_mask,
+            num_chunks=batch_num_tiles,
+        )
diff --git a/nemo/collections/vlm/mllama/model/__init__.py b/nemo/collections/vlm/mllama/model/__init__.py
new file mode 100644
index 0000000000000..9eb076609f84d
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.mllama.model.base import (
+    CrossAttentionTextConfig,
+    CrossAttentionVisionConfig,
+    MLlamaModel,
+    MLlamaModelConfig,
+)
diff --git a/nemo/collections/vlm/mllama/model/base.py b/nemo/collections/vlm/mllama/model/base.py
new file mode 100644
index 0000000000000..f03af078987d7
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/base.py
@@ -0,0 +1,606 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple
+
+import pytorch_lightning as L
+import torch
+import torch.distributed
+from einops import rearrange
+from megatron.core.enums import ModelType
+from megatron.core.models.vision.multimodal_projector import MultimodalProjector
+from megatron.core.optimizer import OptimizerConfig
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear
+from megatron.core.transformer import MegatronModule
+from megatron.core.transformer.mlp import MLPSubmodules
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_config import TransformerConfig
+from PIL import Image as PIL_Image
+from torch import nn
+
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+from nemo.collections.llm import fn
+from nemo.collections.llm.gpt.model import local_layer_spec, transformer_engine_layer_spec
+from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank, get_packed_seq_params
+from nemo.collections.llm.gpt.model.llama import Llama31Config, apply_rope_scaling
+from nemo.collections.vlm.mllama.model.language import CrossAttentionTextModel
+from nemo.collections.vlm.mllama.model.utils import _generate_cross_attention_mask, _pad_attention_masks
+from nemo.collections.vlm.mllama.model.vision import VisionEncoder
+from nemo.lightning import get_vocab_size, io
+from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
+from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
+from nemo.utils import logging
+
+
+def llama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
+    from megatron.core import parallel_state
+
+    # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87
+    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842
+
+    batch = next(dataloader_iter)
+
+    _batch: dict
+    if isinstance(batch, tuple) and len(batch) == 3:
+        _batch = batch[0]
+    else:
+        _batch = batch
+
+    required_keys = set()
+    required_keys.update(
+        (
+            "attention_mask",
+            "tokens",
+            "batch_masks",
+            "position_ids",
+            "num_chunks",
+        )
+    )
+    if parallel_state.is_pipeline_first_stage():
+        required_keys.update(
+            (
+                "batch_images",
+                "aspect_ratio_ids",
+            )
+        )
+    if parallel_state.is_pipeline_last_stage():
+        required_keys.update(
+            (
+                "labels",
+                "loss_mask",
+            )
+        )
+
+    _batch = {
+        key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else val
+        for key, val in _batch.items()
+    }
+    # slice batch along sequence dimension for context parallelism
+    output = get_batch_on_this_context_parallel_rank(_batch)
+
+    return output
+
+
+def llama_forward_step(model, batch) -> torch.Tensor:
+    forward_config = {
+        "batch_images": batch["batch_images"],
+        "batch_masks": batch["batch_masks"],
+        "tokens": batch["tokens"],
+        "position_ids": batch["position_ids"],
+        "aspect_ratio_ids": batch["aspect_ratio_ids"],
+        "num_chunks": batch["num_chunks"],
+        "labels": batch.get("labels", None),
+    }
+
+    if 'cu_seqlens' in batch:
+        forward_config['packed_seq_params'] = get_packed_seq_params(batch)
+
+    return model(**forward_config)
+
+
+def set_input_tensor(self, tensor):
+    pass
+
+
+@dataclass
+class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin):
+    # core params
+
+    bias_activation_fusion: bool = True
+    bias_dropout_add_fusion: bool = True
+
+    # vision model params
+    num_layers: int = 32
+    hidden_size: int = 1280
+    num_attention_heads: int = 16
+    vision_chunk_size: int = -1  # image resolution for image models
+    vision_max_num_chunks: int = 4
+    num_global_layers: int = 8
+    max_num_tiles: int = 4
+    text_hidden_size: int = 4096
+    hidden_dropout: float = 0.0
+    attention_dropout: float = 0.0
+    ffn_dropout: float = 0.0
+    gated: bool = False
+    supported_aspect_ratios: Tuple[Tuple[int, int], ...] = (
+        (1, 1),
+        (1, 2),
+        (1, 3),
+        (1, 4),
+        (2, 1),
+        (2, 2),
+        (3, 1),
+        (4, 1),
+    )
+
+    @property
+    def max_aspect_ratio_id(self) -> int:
+        return len(self.supported_aspect_ratios)
+
+    def configure_model(self) -> "CrossAttentionVisionModel":
+        return CrossAttentionVisionModel(
+            self,
+        )
+
+
+@dataclass
+class CrossAttentionTextConfig(Llama31Config):
+    rotary_base: int = 500_000
+    seq_length: int = 8192
+    num_layers: int = 32
+    hidden_size: int = 4096
+    ffn_hidden_size: int = 14336
+    num_attention_heads: int = 32
+    num_cross_attention_layers: int = 8
+    vocab_size: int = 128256
+    apply_rope_fusion: bool = False
+
+    def _init_fusion_schedule(self, num_layers: int) -> List[int]:
+        llama_layers = list(range(self.num_layers))
+        # uniformly spread the layers
+        k = math.ceil(len(llama_layers) / num_layers)
+        return llama_layers[::-1][::k][:num_layers][::-1]
+
+    def configure_model(self, tokenizer, pre_process=True, post_process=True):
+        self.fusion_schedule = self._init_fusion_schedule(self.num_cross_attention_layers)
+        vp_size = self.virtual_pipeline_model_parallel_size
+        if vp_size:
+            p_size = self.pipeline_model_parallel_size
+            assert (
+                self.num_layers // p_size
+            ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."
+
+        transformer_layer_spec = self.transformer_layer_spec
+        if not isinstance(transformer_layer_spec, ModuleSpec):
+            transformer_layer_spec = transformer_layer_spec(self)
+
+        if hasattr(self, 'vocab_size'):
+            vocab_size = self.vocab_size
+            logging.info(
+                f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
+                f" {vocab_size - tokenizer.vocab_size}."
+            )
+        else:
+            vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)
+
+        model = CrossAttentionTextModel(
+            self,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=vocab_size,
+            max_sequence_length=self.seq_length,
+            fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
+            parallel_output=self.parallel_output,
+            share_embeddings_and_output_weights=self.share_embeddings_and_output_weights,
+            position_embedding_type=self.position_embedding_type,
+            rotary_percent=self.rotary_percent,
+            rotary_base=self.rotary_base,
+            seq_len_interpolation_factor=self.seq_len_interpolation_factor,
+            pre_process=pre_process,
+            post_process=post_process,
+        )
+        model.rotary_pos_emb.inv_freq = apply_rope_scaling(
+            model.rotary_pos_emb.inv_freq,
+            factor=self.scale_factor,
+            low_freq_factor=self.low_freq_factor,
+            high_freq_factor=self.high_freq_factor,
+            old_context_len=self.old_context_len,
+        )
+        return model
+
+
+@dataclass
+class MLlamaModelConfig(TransformerConfig, io.IOMixin):
+    language_model_config: Optional[CrossAttentionTextConfig] = None
+    vision_model_config: Optional[CrossAttentionVisionConfig] = None
+
+    encoder_pipeline_model_parallel_size: int = 0
+    encoder_tensor_model_parallel_size: int = 1
+    vision_num_cross_attention_layers: int = -1
+    num_layers: int = 1  # Placeholder, NOT used!
+    num_attention_heads: int = 8  # Placeholder, NOT used!
+
+    language_model_from_pretrained: Optional[str] = None  # TODO
+    vision_model_from_pretrained: Optional[str] = None  # TODO
+
+    forward_step_fn: Callable = llama_forward_step
+    data_step_fn: Callable = llama_data_step
+
+    def __post_init__(self):
+        model_config_attr = [
+            'num_layers',
+            'hidden_size',
+            'num_attention_heads',
+            'num_query_groups',
+            'ffn_hidden_size',
+            'kv_channels',
+            'hidden_dropout',
+            'attention_dropout',
+            'fp32_residual_connection',
+            'apply_residual_connection_post_layernorm',
+            'layernorm_epsilon',
+            'layernorm_zero_centered_gamma',
+            'add_bias_linear',
+            'add_qkv_bias',
+            'gated_linear_unit',
+            'activation_func',
+            'activation_func_fp8_input_store',
+            'num_moe_experts',
+            'rotary_interleaved',
+            'window_size',
+            'normalization',
+            'qk_layernorm',
+            'test_mode',
+            'calculate_per_token_loss',
+        ]
+
+        if self.language_model_config is not None:
+            for attr in model_config_attr:
+                setattr(self, attr, getattr(self.language_model_config, attr))
+
+    def configure_model(self, tokenizer) -> "MLlamaBaseModel":
+        from megatron.core import parallel_state as ps
+
+        self.language_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size
+        self.vision_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size
+        self.language_model_config.pipeline_model_parallel_size = self.pipeline_model_parallel_size
+
+        if self.encoder_pipeline_model_parallel_size > 0:
+            assert self.encoder_pipeline_model_parallel_size == 1, "ViT can only live on 1 pipeline stage."
+            self.vision_model_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size
+            self.language_model_config.encoder_pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size
+            if self.encoder_tensor_model_parallel_size > 0:
+                self.vision_model_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size
+
+        model = MLlamaBaseModel(
+            config=self,
+            tokenizer=tokenizer,
+            pre_process=ps.is_pipeline_first_stage()
+            or ps.get_pipeline_model_parallel_rank() == self.encoder_pipeline_model_parallel_size,
+            post_process=ps.is_pipeline_last_stage(),
+            add_encoder=ps.is_pipeline_first_stage(),
+            add_decoder=ps.is_pipeline_last_stage()
+            or ps.get_pipeline_model_parallel_rank() >= self.encoder_pipeline_model_parallel_size,
+        )
+
+        return model
+
+
+class CrossAttentionVisionModel(MegatronModule):
+    def __init__(self, config) -> None:
+        super().__init__(config=config)
+        return_intermediate = "3,7,15,23,30"
+        self.vision_input_dim = 1280
+        self.image_res = config.vision_chunk_size
+        self.max_num_chunks = config.vision_max_num_chunks
+        if return_intermediate is not None:
+            return_intermediate = [int(l) for l in return_intermediate.split(",")]
+            self.vision_input_dim = (len(return_intermediate) + 1) * self.vision_input_dim
+        self.patch_size = 14
+        self.vision_encoder = VisionEncoder(
+            config=config,
+            image_size=config.vision_chunk_size,
+            patch_size=self.patch_size,
+            return_intermediate=return_intermediate,
+        ).to(config.params_dtype)
+
+        projection_config = copy.deepcopy(config)
+        projection_config.hidden_size = config.text_hidden_size
+        affine_layer_spec = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=None)
+        self.vision_projection = MultimodalProjector(
+            config=projection_config,
+            submodules=affine_layer_spec,
+            projector_type="affine",
+            input_size=self.vision_input_dim,
+        )
+        self.vision_projection.encoder.skip_bias_add = False  # Temporary fix for a MCore side bug
+
+    def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor:
+        # vision_tokens: (B, T, D)
+        # aspect_ratio_ids: (B, 1)
+        # h: (B, T, D)
+        vision_tokens = self.vision_encoder(images.to(dtype=torch.bfloat16), aspect_ratio_ids)
+        vision_shape = vision_tokens.shape
+        vision_tokens = self.vision_projection(vision_tokens.reshape(-1, *vision_shape[-2:]))
+        vision_tokens = vision_tokens.reshape(*vision_shape[:-1], -1)
+        return vision_tokens
+
+    def set_input_tensor(self, tensor):
+        pass
+
+
+class MLlamaBaseModel(MegatronModule):
+    def __init__(
+        self,
+        config: MLlamaModelConfig,
+        tokenizer: Optional = None,
+        pre_process: bool = True,
+        post_process: bool = True,
+        add_encoder: bool = True,
+        add_decoder: bool = True,
+    ) -> None:
+        super().__init__(config=config)
+
+        language_model_config = config.language_model_config
+        vision_model_config = config.vision_model_config
+        self.pre_process = pre_process
+        self.post_process = post_process
+
+        self.encoder_hidden_state = None
+        self.vision_model: Optional[CrossAttentionVisionModel] = None
+        self.language_model: Optional[CrossAttentionTextModel] = None
+
+        self.share_embeddings_and_output_weights = False
+        self.add_decoder = (language_model_config is not None) and add_decoder
+        self.add_encoder = (vision_model_config is not None) and add_encoder
+
+        if self.add_decoder:
+            self.language_model = language_model_config.configure_model(
+                tokenizer=tokenizer, pre_process=pre_process, post_process=post_process
+            )
+            self.share_embeddings_and_output_weights = self.language_model.share_embeddings_and_output_weights
+
+        if self.add_encoder:
+            self.vision_model = vision_model_config.configure_model()
+
+        self.model_type = ModelType.encoder_and_decoder
+        self.xattn_needed = True
+
+        self.patch_size = 14
+        self.image_res = vision_model_config.vision_chunk_size
+        self.max_num_chunks = vision_model_config.vision_max_num_chunks
+        logging.warning("[WARNING] NeMo Mllama will always pad images to max number of tiles. A fix is coming soon!")
+
+    def setup_cache(self, max_batch_size: int, dtype: torch.dtype):
+        self.language_model.setup_cache(max_batch_size, dtype)
+
+    def compute_xattn_caches_masks(
+        self,
+        vision_tokens: torch.Tensor,
+        vision_orig_shape: Tuple[int, int, int, int, int],
+        batch_masks: torch.Tensor,
+        num_chunks: torch.Tensor,
+        total_len: int,
+    ) -> Tuple[List, torch.Tensor, torch.Tensor]:
+        bsz, nimg, nchunk, ntok, image_token_dim = vision_orig_shape
+
+        xattn_caches = [
+            layer.compute_xattn_kv_cache(vision_tokens) for layer in self.language_model.decoder.xattn_layers
+        ]
+
+        padded_masks = _pad_attention_masks(
+            batch_masks,
+            num_chunks,
+            total_len,
+            self.max_num_chunks,
+            vision_tokens.device,
+        )
+        vision_tokens = rearrange(
+            vision_tokens, "(nimg nchk ntok) b dim -> b nimg nchk ntok dim", nimg=nimg, nchk=nchunk, ntok=ntok
+        )
+        cross_attention_masks, full_text_row_masked_out_mask = _generate_cross_attention_mask(
+            text_token_count=total_len,
+            text_device="cuda",
+            text_dtype=next(self.language_model.parameters()).dtype,
+            vision_tokens=vision_tokens,
+            cross_attention_masks=padded_masks,
+        )
+
+        return (xattn_caches, cross_attention_masks, full_text_row_masked_out_mask)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        tokens: torch.Tensor,
+        labels: Optional[torch.Tensor] = None,
+        batch_images: Optional[torch.Tensor] = None,
+        batch_masks: Optional[torch.Tensor] = None,
+        num_chunks: Optional[torch.Tensor] = None,
+        aspect_ratio_ids: Optional[torch.Tensor] = None,
+        cross_attention_masks: Optional[torch.Tensor] = None,
+        full_text_row_masked_out_mask: Optional[torch.Tensor] = None,
+        xattn_caches: Optional[List] = None,
+    ) -> torch.Tensor:
+        if xattn_caches is None:
+            bsz, max_num_images = batch_images.size(0), batch_images.size(1)
+            vision_orig_shape = (
+                bsz,
+                max_num_images,
+                self.max_num_chunks,
+                int((self.image_res / self.patch_size) ** 2 + 1),
+                self.config.hidden_size,
+            )
+            skip_vision_encoder = False
+            num_chunks[num_chunks > 0] = self.max_num_chunks
+            if max_num_images == 0:
+                skip_vision_encoder = True
+
+            if self.encoder_hidden_state is not None:
+                vision_tokens = self.encoder_hidden_state
+            else:
+                if skip_vision_encoder:
+                    vision_tokens = torch.zeros(
+                        vision_orig_shape,
+                        device="cuda",
+                        dtype=torch.bfloat16,
+                    )
+                else:
+                    vision_tokens = self.vision_model(batch_images, aspect_ratio_ids)
+                    vision_tokens = rearrange(
+                        vision_tokens, "b nimg nchk ntok dim -> (nimg nchk ntok) b dim"
+                    ).contiguous()
+
+            if not self.add_decoder:
+                return vision_tokens
+
+            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.compute_xattn_caches_masks(
+                vision_tokens=vision_tokens,
+                vision_orig_shape=vision_orig_shape,
+                batch_masks=batch_masks,
+                num_chunks=num_chunks,
+                total_len=position_ids.shape[1],
+            )
+
+        assert self.add_decoder, "Language model required for forward pass."
+        language_embeddings = None
+        if self.pre_process:
+            language_embeddings = self.language_model.get_partially_trainable_embedding(tokens)
+            language_embeddings = language_embeddings.transpose(1, 0).contiguous()  # [text_seq_len, b, h_language]
+
+        full_text_row_masked_out_mask = (
+            full_text_row_masked_out_mask[:, :, position_ids[0]].permute(2, 0, 1, 3).squeeze(2)
+            if cross_attention_masks is not None
+            else None
+        )
+        output = self.language_model(
+            input_ids=tokens,
+            position_ids=position_ids,
+            labels=labels,
+            decoder_input=language_embeddings,
+            attention_mask=None,
+            cross_attention_masks=(
+                cross_attention_masks[:, :, position_ids[0]] if cross_attention_masks is not None else None
+            ),
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            xattn_caches=xattn_caches,
+        )
+        return output
+
+    def set_input_tensor(self, input_tensor) -> None:
+        """Set model chunk input tensor."""
+        if not isinstance(input_tensor, list):
+            input_tensor = [input_tensor]
+
+        if self.add_encoder:
+            self.vision_model.set_input_tensor(input_tensor[0])
+        elif self.add_decoder and self.pre_process:
+            self.encoder_hidden_state = input_tensor[0]
+        else:
+            assert len(input_tensor) == 2, 'input_tensor should contain encoder output.'
+            self.language_model.set_input_tensor(input_tensor[0])
+            self.encoder_hidden_state = input_tensor[1]
+
+
+class MLlamaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
+    def __init__(
+        self,
+        config: MLlamaModelConfig,
+        optim: Optional[OptimizerModule] = None,
+        tokenizer: Optional["TokenizerSpec"] = None,
+        model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.tokenizer = tokenizer
+        self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True))
+        self.optim.connect(self)  # This will bind the `configure_optimizers` method
+        self.model_transform = model_transform
+        self._training_loss_reduction = None
+        self._validation_loss_reduction = None
+
+    def configure_model(self) -> None:
+        if not hasattr(self, "module"):
+            self.module: MLlamaBaseModel = self.config.configure_model(self.tokenizer)
+
+    def forward(
+        self,
+        batch_images: List[List[PIL_Image.Image]],
+        tokens: torch.LongTensor,
+        position_ids: torch.LongTensor,
+        batch_masks: Optional[torch.Tensor] = None,
+        num_chunks: Optional[torch.Tensor] = None,
+        aspect_ratio_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        cross_attention_masks: Optional[torch.Tensor] = None,
+        full_text_row_masked_out_mask: Optional[torch.Tensor] = None,
+        xattn_caches: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        output_tensor = self.module(
+            position_ids=position_ids,
+            tokens=tokens,
+            batch_images=batch_images,
+            batch_masks=batch_masks,
+            num_chunks=num_chunks,
+            aspect_ratio_ids=aspect_ratio_ids,
+            labels=labels,
+            cross_attention_masks=cross_attention_masks,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            xattn_caches=xattn_caches,
+        )
+
+        return output_tensor
+
+    def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]:
+        return self.config.data_step_fn(dataloader_iter)
+
+    def forward_step(self, batch) -> torch.Tensor:
+        return self.config.forward_step_fn(self, batch)
+
+    def training_step(self, batch, batch_idx=None) -> torch.Tensor:
+        # In mcore the loss-function is part of the forward-pass (when labels are provided)
+        return self.forward_step(batch)
+
+    def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
+        # In mcore the loss-function is part of the forward-pass (when labels are provided)
+
+        return self.forward_step(batch)
+
+    @property
+    def training_loss_reduction(self) -> MaskedTokenLossReduction:
+        if not self._training_loss_reduction:
+            self._training_loss_reduction = MaskedTokenLossReduction()
+
+        return self._training_loss_reduction
+
+    @property
+    def validation_loss_reduction(self) -> MaskedTokenLossReduction:
+        if not self._validation_loss_reduction:
+            self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True)
+
+        return self._validation_loss_reduction
+
+
+__all__ = [
+    "MLlamaModel",
+    "MLlamaModelConfig",
+    "CrossAttentionTextConfig",
+    "CrossAttentionVisionConfig",
+    "llama_data_step",
+    "llama_forward_step",
+    "transformer_engine_layer_spec",
+    "local_layer_spec",
+]
diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py
new file mode 100644
index 0000000000000..b8985e53c54ce
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/language.py
@@ -0,0 +1,722 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from contextlib import nullcontext
+from dataclasses import dataclass
+from typing import List, Literal, Optional, Union
+
+import torch
+from megatron.core import InferenceParams, parallel_state, tensor_parallel
+from megatron.core.dist_checkpointing.mapping import ShardedStateDict
+from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+
+from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.transformer.attention import Attention
+from megatron.core.transformer.custom_layers.transformer_engine import (
+    TEColumnParallelLinear,
+    TEDotProductAttention,
+    TELayerNormColumnParallelLinear,
+    TERowParallelLinear,
+)
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.mlp import MLP, MLPSubmodules
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.transformer_block import TransformerBlock
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from megatron.core.transformer.utils import sharded_state_dict_default
+from megatron.core.utils import make_viewless_tensor
+from torch import Tensor, nn
+
+from nemo.utils import logging
+
+try:
+    from megatron.core.transformer.custom_layers.transformer_engine import TEDelayedScaling, TENorm
+
+    HAVE_TE = True
+    LayerNormImpl = TENorm
+except ImportError:
+    from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
+
+    HAVE_TE = False
+    LayerNormImpl = WrappedTorchLayerNorm
+
+
+@dataclass
+class MLlamaCrossAttentionSubmodules:
+    linear_q: Union[ModuleSpec, type] = None
+    linear_kv: Union[ModuleSpec, type] = None
+    core_attention: Union[ModuleSpec, type] = None
+    linear_proj: Union[ModuleSpec, type] = None
+    q_layernorm: Union[ModuleSpec, type] = None
+    k_layernorm: Union[ModuleSpec, type] = None
+
+
+class CrossAttentionTextModel(MCoreGPTModel):
+    def __init__(
+        self,
+        config: TransformerConfig,
+        transformer_layer_spec: ModuleSpec,
+        vocab_size: int,
+        max_sequence_length: int,
+        pre_process: bool = True,
+        post_process: bool = True,
+        fp16_lm_cross_entropy: bool = False,
+        parallel_output: bool = True,
+        share_embeddings_and_output_weights: bool = False,
+        position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
+        rotary_percent: float = 1.0,
+        rotary_base: int = 10000,
+        seq_len_interpolation_factor: Optional[float] = None,
+    ):
+        super().__init__(
+            config,
+            transformer_layer_spec,
+            vocab_size,
+            max_sequence_length,
+            pre_process,
+            post_process,
+            fp16_lm_cross_entropy,
+            parallel_output,
+            share_embeddings_and_output_weights,
+            position_embedding_type,
+            rotary_percent,
+            rotary_base,
+            seq_len_interpolation_factor,
+        )
+
+        # Overwrite the self.decoder
+        self.decoder = CrossAttentionTransformerBlock(
+            config=self.config,
+            spec=transformer_layer_spec,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+        )
+
+        if self.pre_process:
+            self.learnable_embedding = tensor_parallel.VocabParallelEmbedding(
+                num_embeddings=8,
+                embedding_dim=self.config.hidden_size,
+                init_method=self.config.init_method,
+                reduce_scatter_embeddings=False,  # TODO double check this
+                config=self.config,
+            )
+
+            self.num_frozen_embeddings = self.embedding.word_embeddings.num_embeddings
+            self._thresh = self.num_frozen_embeddings - 1
+
+    def get_partially_trainable_embedding(self, x):
+        xz = torch.zeros_like(x, device=x.device)
+        oz = torch.ones_like(x, device=x.device)
+        x_orig = torch.minimum(x, torch.tensor(self._thresh, device=x.device))
+        x_new = torch.maximum(x, torch.tensor(self._thresh + 1, device=x.device)) - self.num_frozen_embeddings
+
+        mask_orig = torch.where(x >= self.num_frozen_embeddings, xz, oz).unsqueeze(-1)
+        mask_new = torch.where(x < self.num_frozen_embeddings, xz, oz).unsqueeze(-1)
+
+        x_orig = self.embedding(x_orig, None).transpose(0, 1)
+        x_new = self.learnable_embedding(x_new).type_as(x_orig)
+        return x_orig * mask_orig.type_as(x_orig) + x_new * mask_new.type_as(x_new)
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        position_ids: Tensor,
+        attention_mask: Tensor,
+        decoder_input: Tensor = None,
+        cross_attention_masks: Tensor = None,
+        full_text_row_masked_out_mask: Tensor = None,
+        xattn_caches: Optional[List] = None,
+        labels: Tensor = None,
+        inference_params: InferenceParams = None,
+        packed_seq_params: PackedSeqParams = None,
+        extra_block_kwargs: dict = None,
+    ) -> Tensor:
+
+        # Decoder embedding.
+        if decoder_input is not None:
+            pass
+        elif self.pre_process:
+            raise ValueError("Require: decoder_input is not None or self.pre_process is False")
+        else:
+            # intermediate stage of pipeline
+            # decoder will get hidden_states from encoder.input_tensor
+            decoder_input = None
+
+        # Rotary positional embeddings (embedding is None for PP intermediate devices)
+        rotary_pos_emb = None
+        if self.position_embedding_type == 'rope':
+            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
+                inference_params,
+                self.decoder,
+                decoder_input,
+                self.config,
+                packed_seq_params=None,
+            )
+            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
+
+        # Run decoder.
+        hidden_states = self.decoder(
+            hidden_states=decoder_input,
+            attention_mask=attention_mask,
+            inference_params=inference_params,
+            rotary_pos_emb=rotary_pos_emb,
+            packed_seq_params=packed_seq_params,
+            cross_attention_masks=cross_attention_masks,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            xattn_caches=xattn_caches,
+            **(extra_block_kwargs or {}),
+        )
+
+        if not self.post_process:
+            return hidden_states
+
+        # logits and loss
+        output_weight = None
+        if self.share_embeddings_and_output_weights:
+            output_weight = self.shared_embedding_or_output_weight()
+        logits, _ = self.output_layer(hidden_states, weight=output_weight)
+
+        if labels is None:
+            # [s b h] => [b s h]
+            return logits.transpose(0, 1).contiguous()
+
+        loss = self.compute_language_model_loss(labels, logits)
+
+        return loss
+
+
+class CrossAttentionTransformerBlock(TransformerBlock):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.fusion_schedule = [
+            x - self._get_layer_offset()
+            for x in self.config.fusion_schedule
+            if 0 <= (x - self._get_layer_offset()) < self.num_layers_per_pipeline_rank
+        ]
+        self.xattn_layers = []
+
+        for i in range(self.num_layers_per_pipeline_rank):
+            if i in self.fusion_schedule:
+                layer_spec = ModuleSpec(
+                    module=CrossAttentionTransformerLayer,
+                    submodules=TransformerLayerSubmodules(
+                        cross_attention=ModuleSpec(
+                            module=MLlamaCrossAttention,
+                            params={"attn_mask_type": AttnMaskType.arbitrary},
+                            submodules=MLlamaCrossAttentionSubmodules(
+                                linear_q=TELayerNormColumnParallelLinear,  # This wraps attention_norm before attention
+                                linear_kv=TEColumnParallelLinear,
+                                core_attention=TEDotProductAttention,
+                                linear_proj=TERowParallelLinear,
+                                q_layernorm=TENorm,
+                                k_layernorm=TENorm,
+                            ),
+                        ),
+                        cross_attn_bda=get_bias_dropout_add,
+                        pre_mlp_layernorm=IdentityOp,
+                        mlp=ModuleSpec(
+                            module=MLP,
+                            submodules=MLPSubmodules(
+                                linear_fc1=TELayerNormColumnParallelLinear,  # This wraps ffn_norm before feed_forward
+                                linear_fc2=TERowParallelLinear,
+                            ),
+                        ),
+                        mlp_bda=get_bias_dropout_add,
+                    ),
+                )
+                self.xattn_layers.append(build_module(layer_spec, config=self.config, layer_number=i + 1))
+            else:
+                self.xattn_layers.append(DummyCrossAttentionTransformerLayer(config=self.config))
+        self.xattn_layers = torch.nn.ModuleList(self.xattn_layers)
+
+        assert len(self.xattn_layers) == len(self.layers), 'Check PP implementation for cross attention layers!'
+
+    def _get_layer_offset(self):
+        encoder_pipeline_model_parallel_size = getattr(self.config, "encoder_pipeline_model_parallel_size", 0)
+        decoder_pipeline_model_parallel_rank = (
+            parallel_state.get_pipeline_model_parallel_rank() - encoder_pipeline_model_parallel_size
+        )
+        return decoder_pipeline_model_parallel_rank * self.num_layers_per_pipeline_rank
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Tensor,
+        xattn_caches: Optional[List] = None,
+        cross_attention_masks: Tensor = None,
+        full_text_row_masked_out_mask: Tensor = None,
+        rotary_pos_emb: Tensor = None,
+        inference_params: InferenceParams = None,
+        packed_seq_params: PackedSeqParams = None,
+    ):
+        # hidden_states (float): [s, b, h]
+        # attention_mask (bool): [1, 1, s, s]
+
+        if not self.pre_process:
+            hidden_states = self.input_tensor
+
+        hidden_states = make_viewless_tensor(
+            inp=hidden_states,
+            requires_grad=True,
+            keep_graph=True,
+        )
+
+        if self.config.sequence_parallel:
+            rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
+        else:
+            rng_context = nullcontext()
+
+        if self.config.fp8:
+            import transformer_engine  # To keep out TE dependency when not training in fp8
+
+            if self.config.fp8 == "e4m3":
+                fp8_format = transformer_engine.common.recipe.Format.E4M3
+            elif self.config.fp8 == "hybrid":
+                fp8_format = transformer_engine.common.recipe.Format.HYBRID
+            else:
+                raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
+
+            fp8_recipe = TEDelayedScaling(
+                config=self.config,
+                fp8_format=fp8_format,
+                override_linear_precision=(False, False, not self.config.fp8_wgrad),
+            )
+            fp8_group = None
+            if parallel_state.model_parallel_is_initialized():
+                fp8_group = parallel_state.get_amax_reduction_group(with_context_parallel=True)
+            fp8_context = transformer_engine.pytorch.fp8_autocast(
+                enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
+            )
+        else:
+            fp8_context = nullcontext()
+
+        with rng_context and fp8_context:
+            # Forward pass.
+            if self.config.recompute_granularity == 'full' and self.training:
+                raise NotImplementedError
+            else:
+                for l_no, (layer, xattn_layer) in enumerate(zip(self.layers, self.xattn_layers)):
+                    layer: TransformerLayer
+                    xattn_layer: Union[DummyCrossAttentionTransformerLayer, CrossAttentionTransformerLayer]
+                    with self.offload_context:
+                        if (len(self.cuda_graphs) == 0) or (not self.training):
+                            hidden_states, context = xattn_layer(
+                                hidden_states=hidden_states,
+                                cross_attention_masks=cross_attention_masks,
+                                xattn_cache=xattn_caches[l_no],
+                                full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+                                rotary_pos_emb=rotary_pos_emb,
+                                inference_params=inference_params,
+                                packed_seq_params=packed_seq_params,
+                            )
+                            hidden_states, context = layer(
+                                hidden_states=hidden_states,
+                                attention_mask=attention_mask,
+                                rotary_pos_emb=rotary_pos_emb,
+                                inference_params=inference_params,
+                                packed_seq_params=packed_seq_params,
+                            )
+                            # CUDA graph doesn't output context and is expected to be None
+                            assert (context is None) or (not self.config.enable_cuda_graph) or (not self.training)
+                        else:
+                            assert (len(self.cuda_graphs) > l_no) and (
+                                self.current_microbatch < len(self.cuda_graphs[l_no])
+                            )
+                            hidden_states = self.cuda_graphs[l_no][self.current_microbatch](
+                                hidden_states, is_first_microbatch=(self.current_microbatch == 0)
+                            )
+
+                    if (
+                        torch.is_grad_enabled()
+                        and self.config.cpu_offloading
+                        and self.group_prefetch_offload_commit_async is not None
+                    ):
+                        hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
+
+        # Final layer norm.
+        if self.final_layernorm is not None:
+            hidden_states = self.final_layernorm(hidden_states)
+            hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
+
+        return hidden_states
+
+    def sharded_state_dict(
+        self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None
+    ) -> ShardedStateDict:
+        sharded_state_dict = {}
+
+        layer_prefix = f'{prefix}layers.'
+        num_layers = self.config.num_layers
+        for layer in self.layers:
+            offset = layer._get_layer_offset()
+            global_layer_offset = layer.layer_number - 1  # self.layer_number starts at 1
+            state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.'  # module list index in TransformerBlock # pylint: disable=line-too-long
+            sharded_prefix = layer_prefix
+            sharded_pp_offset = [(0, global_layer_offset, num_layers)]  # PP sharding offset for ShardedTensors
+            layer_sharded_state_dict = layer.sharded_state_dict(state_dict_prefix, sharded_pp_offset, metadata)
+            replace_prefix_for_sharding(layer_sharded_state_dict, state_dict_prefix, sharded_prefix)
+            sharded_state_dict.update(layer_sharded_state_dict)
+
+        xlayer_prefix = f'{prefix}xattn_layers.'
+        for xlayer in self.xattn_layers:
+            if isinstance(xlayer, DummyCrossAttentionTransformerLayer):
+                continue
+            offset = xlayer._get_layer_offset()
+            global_layer_offset = xlayer.layer_number - 1
+            state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.'  # module list index in TransformerBlock # pylint: disable=line-too-long
+            sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.'
+            sharded_pp_offset = []
+            xlayer_sharded_state_dict = xlayer.sharded_state_dict(state_dict_prefix, sharded_pp_offset, metadata)
+            replace_prefix_for_sharding(xlayer_sharded_state_dict, state_dict_prefix, sharded_prefix)
+            sharded_state_dict.update(xlayer_sharded_state_dict)
+
+        # Add modules other than self.layers
+        for name, module in self.named_children():
+            if not module is self.layers and not module is self.xattn_layers:
+                sharded_state_dict.update(
+                    sharded_state_dict_default(module, f'{prefix}{name}.', sharded_offsets, metadata)
+                )
+
+        return sharded_state_dict
+
+
+class CrossAttentionTransformerLayer(TransformerLayer):
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: TransformerLayerSubmodules,
+        layer_number: int = 1,
+        hidden_dropout: float = None,
+    ):
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            hidden_dropout=hidden_dropout,
+        )
+
+        self.gate_attn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))
+        self.gate_ffn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))
+
+    def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Tensor:
+        return self.cross_attention._compute_xattn_kv_cache(xattn_tokens)
+
+    def forward(
+        self,
+        hidden_states,
+        cross_attention_masks,
+        xattn_cache=None,
+        full_text_row_masked_out_mask=None,
+        rotary_pos_emb=None,
+        inference_params=None,
+        packed_seq_params=None,
+    ):
+        # hidden_states: [s, b, h]
+
+        # Residual connection.
+        residual = hidden_states
+
+        # Optional Layer norm after self-attention
+        pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states)
+
+        # Cross attention.
+        attention_output_with_bias = self.cross_attention(
+            pre_cross_attn_layernorm_output,
+            cross_attention_masks=cross_attention_masks,
+            xattn_cache=xattn_cache,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            rotary_pos_emb=rotary_pos_emb,
+            inference_params=inference_params,
+        )
+
+        _gate_attn = self.gate_attn.tanh()
+        assert isinstance(
+            attention_output_with_bias, tuple
+        ), "`attention_output_with_bias` needs to be tuple for gating."
+        attention_output_with_bias = tuple(
+            _gate_attn * output if output is not None else None for output in attention_output_with_bias
+        )
+
+        # TODO: could we move `bias_dropout_add_exec_handler` itself
+        # inside the module provided in the `bias_dropout_add_spec` module?
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.cross_attn_bda(self.training, self.config.bias_dropout_fusion)(
+                attention_output_with_bias, residual, self.hidden_dropout
+            )
+
+        # Residual connection.
+        residual = hidden_states
+
+        # Optional Layer norm post the cross-attention.
+        pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
+
+        # MLP.
+        mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output)
+
+        _gate_ffn = self.gate_ffn.tanh() * full_text_row_masked_out_mask
+        assert isinstance(mlp_output_with_bias, tuple), "`mlp_output_with_bias` needs to be tuple for gating."
+        mlp_output_with_bias = tuple(
+            _gate_ffn * output if output is not None else None for output in mlp_output_with_bias
+        )
+
+        # TODO: could we move `bias_dropout_add_exec_handler` itself
+        # inside the module provided in the `bias_dropout_add_spec` module?
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
+                mlp_output_with_bias, residual, self.hidden_dropout
+            )
+
+        # Jit compiled function creates 'view' tensor. This tensor
+        # potentially gets saved in the MPU checkpoint function context,
+        # which rejects view tensors. While making a viewless tensor here
+        # won't result in memory savings (like the data loader, or
+        # p2p_communication), it serves to document the origin of this
+        # 'view' tensor.
+        output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True)
+
+        return output, None  # context
+
+
+class DummyCrossAttentionTransformerLayer(MegatronModule):
+    """Dummy cross-attention transformer block with tanh-gated attention and feedforward."""
+
+    def __call__(
+        self,
+        hidden_states: Tensor,
+        *args,
+        **kwargs,
+    ):
+        return hidden_states, None
+
+    def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Optional[Tensor]:
+        return None
+
+
+class MLlamaCrossAttention(Attention):
+    """Cross-attention layer class for Llama VLM support
+
+    Cross-attention layer takes input with size [s, b, h] and context with size
+    [s, b, h] and returns output of the same size.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: MLlamaCrossAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type=AttnMaskType.padding,
+    ):
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            attn_mask_type=attn_mask_type,
+            attention_type="cross",
+        )
+
+        # TODO might need special care when TP>8
+        assert self.query_projection_size % self.kv_projection_size == 0
+
+        self.linear_q = build_module(
+            submodules.linear_q,
+            self.config.hidden_size,
+            self.query_projection_size,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=self.config.add_bias_linear,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+
+        self.linear_kv = build_module(
+            submodules.linear_kv,
+            self.config.hidden_size,
+            2 * self.kv_projection_size,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=self.config.add_bias_linear,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+
+        self.q_layernorm = build_module(
+            submodules.q_layernorm,
+            hidden_size=self.hidden_size_per_attention_head,
+            config=self.config,
+            eps=self.config.layernorm_epsilon,
+        )
+
+        self.k_layernorm = build_module(
+            submodules.k_layernorm,
+            hidden_size=self.hidden_size_per_attention_head,
+            config=self.config,
+            eps=self.config.layernorm_epsilon,
+        )
+
+    def get_key_value_tensors(self, key_value_states):
+        mixed_kv, _ = self.linear_kv(key_value_states)
+
+        # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
+        new_tensor_shape = mixed_kv.size()[:-1] + (
+            self.num_query_groups_per_partition,
+            2 * self.hidden_size_per_attention_head,
+        )
+        mixed_kv = mixed_kv.view(*new_tensor_shape)
+
+        # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
+        (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2)
+        # Apply LayerNorm
+        key = self.k_layernorm(key.contiguous())
+        return key, value
+
+    def get_query_tensor(self, hidden_states):
+
+        # Attention head [sq, b, h] --> [sq, b, hp]
+        query, _ = self.linear_q(hidden_states)
+
+        # [sq, b, hp] --> [sq, b, np, hn]
+        new_tensor_shape = query.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        )
+        query = query.view(*new_tensor_shape)
+
+        # Apply LayerNorm
+        query = self.q_layernorm(query)
+
+        return query
+
+    def get_query_key_value_tensors(self, hidden_states, key_value_states):
+        query = self.get_query_tensor(hidden_states)
+        key, value = self.get_key_value_tensors(key_value_states)
+        return query, key, value
+
+    def forward(
+        self,
+        hidden_states,
+        cross_attention_masks,
+        xattn_cache=None,
+        full_text_row_masked_out_mask=None,
+        inference_params=None,
+        rotary_pos_emb=None,
+        packed_seq_params=None,
+    ):
+
+        # For self attention we just duplicate the rotary_pos_emb if it isn't already
+        if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple):
+            rotary_pos_emb = (rotary_pos_emb,) * 2
+
+        # =====================
+        # Query, Key, and Value
+        # =====================
+        # Get the query, key and value tensors based on the type of attention -
+        # self or cross attn.
+        query = self.get_query_tensor(hidden_states)
+        key, value = xattn_cache
+
+        # ===================================================
+        # Adjust key, value, and rotary_pos_emb for inference
+        # ===================================================
+        key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference(
+            inference_params, key, value, rotary_pos_emb
+        )
+
+        if packed_seq_params is not None:
+            query = query.squeeze(1)
+            key = key.squeeze(1)
+            value = value.squeeze(1)
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        # In TE "True" means masked out
+        cross_attention_masks = torch.where(cross_attention_masks == 0, False, True)
+
+        if self.checkpoint_core_attention and self.training:
+            core_attn_out = self._checkpointed_attention_forward(
+                query,
+                key,
+                value,
+                cross_attention_masks,
+                attn_mask_type=attn_mask_type,
+                packed_seq_params=packed_seq_params,
+            )
+        else:
+            core_attn_out = self.core_attention(
+                query,
+                key,
+                value,
+                cross_attention_masks,
+                attn_mask_type=attn_mask_type,
+                packed_seq_params=packed_seq_params,
+            )
+
+        if packed_seq_params is not None:
+            # reshape to same output shape as unpacked case
+            # (t, np, hn) -> (t, b=1, h=np*hn)
+            # t is the pack size = sum (sq_i)
+            # note that batch is a dummy dimension in the packed case
+            core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1)
+
+        # [b, head, s, dim]
+        core_attn_out = core_attn_out * full_text_row_masked_out_mask
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+
+        output, bias = self.linear_proj(core_attn_out)
+
+        return output, bias
+
+    def _compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Tensor:
+        key, value = self.get_key_value_tensors(xattn_tokens)
+        return torch.stack([key, value])
+
+
+def apply_rope_scaling(
+    inv_freq,
+    factor: int = 8,
+    low_freq_factor: int = 1,
+    high_freq_factor: int = 4,
+    old_context_len: int = 8192,
+):
+    logging.info(
+        f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}."
+    )
+
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+
+    wavelen = 2 * math.pi / inv_freq
+    # wavelen < high_freq_wavelen: do nothing
+    # wavelen > low_freq_wavelen: divide by factor
+    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+    # otherwise: interpolate between the two, using a smooth factor
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+
+    return inv_freq_llama
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
new file mode 100644
index 0000000000000..ce618f6c36df5
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, Optional
+
+import torch
+import torch.distributed
+from megatron.core.transformer import TransformerConfig
+from torch import Tensor
+
+from nemo.collections.vlm.mllama.model.base import (
+    CrossAttentionTextConfig,
+    CrossAttentionVisionConfig,
+    MLlamaModel,
+    MLlamaModelConfig,
+)
+from nemo.lightning import MegatronStrategy, Trainer, io, teardown
+from nemo.lightning.pytorch.utils import dtype_from_hf
+
+
+@dataclass
+class MLlamaConfig11B(MLlamaModelConfig):
+    language_model_config: Optional[TransformerConfig] = field(default_factory=lambda: CrossAttentionTextConfig())
+    vision_model_config: Optional[TransformerConfig] = field(
+        default_factory=lambda: CrossAttentionVisionConfig(vision_chunk_size=448)
+    )
+
+
+@dataclass
+class MLlamaConfig11BInstruct(MLlamaModelConfig):
+    language_model_config: Optional[TransformerConfig] = field(default_factory=lambda: CrossAttentionTextConfig())
+    vision_model_config: Optional[TransformerConfig] = field(
+        default_factory=lambda: CrossAttentionVisionConfig(vision_chunk_size=560)
+    )
+
+
+@dataclass
+class MLlamaConfig90B(MLlamaModelConfig):
+    language_model_config: Optional[TransformerConfig] = field(
+        default_factory=lambda: CrossAttentionTextConfig(
+            hidden_size=8192,
+            ffn_hidden_size=28672,
+            num_attention_heads=64,
+            num_layers=80,
+            num_cross_attention_layers=20,
+        )
+    )
+    vision_model_config: Optional[TransformerConfig] = field(
+        default_factory=lambda: CrossAttentionVisionConfig(vision_chunk_size=560, text_hidden_size=8192)
+    )
+
+
+@dataclass
+class MLlamaConfig90BInstruct(MLlamaConfig90B):
+    pass
+
+
+@io.model_importer(MLlamaModel, "hf")
+class HFMLlamaImporter(io.ModelConnector["MLlamaModel", MLlamaModel]):
+    def init(self) -> MLlamaModel:
+        return MLlamaModel(self.config, tokenizer=self.tokenizer)
+
+    def local_path(self, base_path: Optional[Path] = None) -> Path:
+        # note: this entire function is for debugging
+        output_path = super().local_path(base_path)
+        return output_path
+
+    def apply(self, output_path: Path) -> Path:
+        from transformers import MllamaForConditionalGeneration
+
+        source = MllamaForConditionalGeneration.from_pretrained(str(self), torch_dtype='auto')
+
+        class ModelState:
+            def __init__(self, state_dict):
+                self._state_dict = state_dict
+
+            def state_dict(self):
+                return self._state_dict
+
+        state_dict = _rename_xattn_layer_nums_hf(source.state_dict())
+        source = ModelState(state_dict)
+        target = self.init()
+        dummy_trainer = Trainer(
+            devices=1,
+            accelerator="cpu",
+            strategy=MegatronStrategy(
+                store_optimizer_states=False,
+                save_ckpt_format='torch_dist',
+            ),
+        )
+        trainer = self.nemo_setup(target, dummy_trainer)
+        self.convert_state(source, target)
+        self.nemo_save(output_path, trainer)
+
+        print(f"Converted Mllama model to Nemo, model saved to {output_path}")
+
+        teardown(trainer, target)
+        del trainer, target
+
+        return output_path
+
+    def convert_state(self, source, target):
+        mapping = {}
+        transforms = []
+        mapping.update(
+            {
+                "language_model.model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
+                "language_model.model.xattn_layers.*.cross_attn.o_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight",
+                "language_model.model.xattn_layers.*.cross_attn.q_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight",
+                "language_model.model.norm.weight": "language_model.decoder.final_layernorm.weight",
+                "language_model.lm_head.weight": "language_model.output_layer.weight",
+                "language_model.model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
+                "language_model.model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
+                "language_model.model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
+                "language_model.model.xattn_layers.*.cross_attn.k_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight",
+                "language_model.model.xattn_layers.*.input_layernorm.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.layer_norm_weight",
+                "language_model.model.xattn_layers.*.cross_attn.q_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight",
+                "language_model.model.xattn_layers.*.post_attention_layernorm.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc1.layer_norm_weight",
+                "language_model.model.xattn_layers.*.mlp.down_proj.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc2.weight",
+            }
+        )
+
+        transforms.extend(
+            [
+                io.state_transform(
+                    source_key="language_model.model.xattn_layers.*.cross_attn_attn_gate",
+                    target_key="language_model.decoder.xattn_layers.*.gate_attn",
+                    fn=_import_gate,
+                ),
+                io.state_transform(
+                    source_key="language_model.model.xattn_layers.*.cross_attn_mlp_gate",
+                    target_key="language_model.decoder.xattn_layers.*.gate_ffn",
+                    fn=_import_gate,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "language_model.model.layers.*.self_attn.q_proj.weight",
+                        "language_model.model.layers.*.self_attn.k_proj.weight",
+                        "language_model.model.layers.*.self_attn.v_proj.weight",
+                    ),
+                    target_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
+                    fn=_import_text_qkv,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "language_model.model.layers.*.mlp.gate_proj.weight",
+                        "language_model.model.layers.*.mlp.up_proj.weight",
+                    ),
+                    target_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
+                    fn=_import_simple_concat,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "language_model.model.xattn_layers.*.cross_attn.k_proj.weight",
+                        "language_model.model.xattn_layers.*.cross_attn.v_proj.weight",
+                    ),
+                    target_key="language_model.decoder.xattn_layers.*.cross_attention.linear_kv.weight",
+                    fn=_import_text_kv,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "language_model.model.xattn_layers.*.mlp.gate_proj.weight",
+                        "language_model.model.xattn_layers.*.mlp.up_proj.weight",
+                    ),
+                    target_key="language_model.decoder.xattn_layers.*.mlp.linear_fc1.weight",
+                    fn=_import_simple_concat,
+                ),
+                io.state_transform(
+                    source_key="language_model.model.embed_tokens.weight",
+                    target_key=(
+                        "language_model.embedding.word_embeddings.weight",
+                        "language_model.learnable_embedding.weight",
+                    ),
+                    fn=_import_embedding_hf,
+                ),
+            ]
+        )
+
+        v = "vision_model.vision_encoder"
+        mapping.update(
+            {
+                "vision_model.global_transformer.layers.*.self_attn.o_proj.weight": f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight",
+                "vision_model.global_transformer.layers.*.gate_attn": f"{v}.global_transformer.layers.*.gate_attn",
+                "vision_model.global_transformer.layers.*.gate_ffn": f"{v}.global_transformer.layers.*.gate_ffn",
+                "vision_model.global_transformer.layers.*.input_layernorm.bias": f"{v}.global_transformer.layers.*.input_layernorm.bias",
+                "vision_model.global_transformer.layers.*.input_layernorm.weight": f"{v}.global_transformer.layers.*.input_layernorm.weight",
+                "vision_model.global_transformer.layers.*.post_attention_layernorm.bias": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias",
+                "vision_model.global_transformer.layers.*.post_attention_layernorm.weight": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight",
+                "vision_model.global_transformer.layers.*.mlp.fc1.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias",
+                "vision_model.global_transformer.layers.*.mlp.fc1.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight",
+                "vision_model.global_transformer.layers.*.mlp.fc2.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias",
+                "vision_model.global_transformer.layers.*.mlp.fc2.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight",
+                "vision_model.transformer.layers.*.self_attn.o_proj.weight": f"{v}.transformer.layers.*.self_attention.linear_proj.weight",
+                "vision_model.transformer.layers.*.input_layernorm.bias": f"{v}.transformer.layers.*.input_layernorm.bias",
+                "vision_model.transformer.layers.*.input_layernorm.weight": f"{v}.transformer.layers.*.input_layernorm.weight",
+                "vision_model.transformer.layers.*.post_attention_layernorm.bias": f"{v}.transformer.layers.*.pre_mlp_layernorm.bias",
+                "vision_model.transformer.layers.*.post_attention_layernorm.weight": f"{v}.transformer.layers.*.pre_mlp_layernorm.weight",
+                "vision_model.transformer.layers.*.mlp.fc1.bias": f"{v}.transformer.layers.*.mlp.linear_fc1.bias",
+                "vision_model.transformer.layers.*.mlp.fc1.weight": f"{v}.transformer.layers.*.mlp.linear_fc1.weight",
+                "vision_model.transformer.layers.*.mlp.fc2.bias": f"{v}.transformer.layers.*.mlp.linear_fc2.bias",
+                "vision_model.transformer.layers.*.mlp.fc2.weight": f"{v}.transformer.layers.*.mlp.linear_fc2.weight",
+                "vision_model.class_embedding": f"{v}.class_embedding",
+                "vision_model.gated_positional_embedding.embedding": f"{v}.positional_embedding",
+                "vision_model.gated_positional_embedding.tile_embedding.weight": f"{v}.gated_tile_positional_embedding.weight",
+                "vision_model.gated_positional_embedding.gate": f"{v}.gated_positional_embedding_gate",
+                "vision_model.layernorm_post.bias": f"{v}.ln_post.bias",
+                "vision_model.layernorm_post.weight": f"{v}.ln_post.weight",
+                "vision_model.layernorm_pre.bias": f"{v}.ln_pre.bias",
+                "vision_model.layernorm_pre.weight": f"{v}.ln_pre.weight",
+                "vision_model.post_tile_positional_embedding.embedding.weight": f"{v}.post_tile_pos_embed.embedding.weight",
+                "vision_model.post_tile_positional_embedding.gate": f"{v}.post_tile_pos_embed.gate",
+                "vision_model.pre_tile_positional_embedding.embedding.weight": f"{v}.pre_tile_pos_embed.embedding.weight",
+                "vision_model.pre_tile_positional_embedding.gate": f"{v}.pre_tile_pos_embed.gate",
+                "multi_modal_projector.bias": "vision_model.vision_projection.encoder.bias",
+                "multi_modal_projector.weight": "vision_model.vision_projection.encoder.weight",
+            }
+        )
+        transforms.extend(
+            [
+                io.state_transform(
+                    source_key=(
+                        "vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
+                        "vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
+                        "vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
+                    ),
+                    target_key=(f"{v}.global_transformer.layers.*.self_attention.linear_qkv.weight"),
+                    fn=_import_vision_qkv,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "vision_model.transformer.layers.*.self_attn.q_proj.weight",
+                        "vision_model.transformer.layers.*.self_attn.k_proj.weight",
+                        "vision_model.transformer.layers.*.self_attn.v_proj.weight",
+                    ),
+                    target_key=(f"{v}.transformer.layers.*.self_attention.linear_qkv.weight"),
+                    fn=_import_vision_qkv,
+                ),
+                io.state_transform(
+                    source_key="vision_model.patch_embedding.weight",
+                    target_key=f"{v}.conv1._linear.weight",
+                    fn=_import_patch_embedding_hf,
+                ),
+            ]
+        )
+
+        return io.apply_transforms(source, target, mapping=mapping, transforms=transforms)
+
+    @property
+    def tokenizer(self) -> "AutoTokenizer":
+        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
+        return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)))
+
+    @property
+    def config(self) -> MLlamaModelConfig:
+        from transformers import AutoConfig
+
+        source = AutoConfig.from_pretrained(str(self))
+
+        return MLlamaModelConfig(
+            language_model_config=self._language_model_config(source),
+            vision_model_config=self._vision_model_config(source),
+        )
+
+    def _language_model_config(self, source) -> Optional[CrossAttentionTextConfig]:
+        def _calculate_num_layers(num_hidden_layers, cross_attention_layers):
+            return num_hidden_layers - len(cross_attention_layers)
+
+        return CrossAttentionTextConfig(
+            rotary_base=source.text_config.rope_theta,
+            seq_length=8192,
+            num_layers=_calculate_num_layers(
+                source.text_config.num_hidden_layers, source.text_config.cross_attention_layers
+            ),
+            num_cross_attention_layers=len(source.text_config.cross_attention_layers),
+            hidden_size=source.text_config.hidden_size,
+            ffn_hidden_size=source.text_config.intermediate_size,
+            num_attention_heads=source.text_config.num_attention_heads,
+            num_query_groups=source.text_config.num_key_value_heads,
+            vocab_size=source.text_config.vocab_size,
+            fp16=(dtype_from_hf(source) == torch.float16),
+            bf16=(dtype_from_hf(source) == torch.bfloat16),
+            params_dtype=dtype_from_hf(source),
+        )
+
+    def _vision_model_config(self, source) -> Optional[CrossAttentionVisionConfig]:
+        return CrossAttentionVisionConfig(
+            num_layers=source.vision_config.num_hidden_layers,
+            hidden_size=source.vision_config.hidden_size,
+            num_attention_heads=source.vision_config.attention_heads,
+            vision_chunk_size=source.vision_config.image_size,
+            vision_max_num_chunks=source.vision_config.max_num_tiles,
+            text_hidden_size=source.text_config.hidden_size,
+            fp16=(dtype_from_hf(source) == torch.float16),
+            bf16=(dtype_from_hf(source) == torch.bfloat16),
+            params_dtype=dtype_from_hf(source),
+        )
+
+
+def _rename_xattn_layer_nums_hf(source: Dict):
+    def convert_layer_num(match):
+        layer_num = int(match.group(1))
+        cross_num = (layer_num - 3) // (cross_attention_frequency + 1)
+        if (layer_num - 3) % (cross_attention_frequency + 1) == 0:
+            new_layer_num = cross_num * cross_attention_frequency + 3
+            return f'xattn_layers.{new_layer_num}.'
+
+        new_layer_num = layer_num - cross_num - 1
+        return f'layers.{new_layer_num}.'
+
+    cross_attention_frequency = 4
+
+    output_dict = {}
+    for k, v in source.items():
+        if "language_model" in k:
+            output_dict[re.sub(r"layers\.(\d+)\.", convert_layer_num, k)] = v
+        else:
+            output_dict[k] = v
+    return output_dict
+
+
+def _import_embedding_hf(a):
+    return torch.split(a, a.shape[0] - 8, dim=0)
+
+
+def _import_patch_embedding_hf(a):
+    return a.reshape(a.shape[0], -1)
+
+
+def _import_gate(gate):
+    return gate[0:1]
+
+
+def _import_vision_qkv(ctx: io.TransformCTX, q, k, v):
+    vision_config = ctx.target.config.vision_model_config
+
+    head_num = vision_config.num_attention_heads
+    num_query_groups = vision_config.num_query_groups
+    head_size = vision_config.kv_channels
+    hidden_size = vision_config.hidden_size
+    return _merge_qkv(q, k, v, head_num, num_query_groups, head_size, hidden_size)
+
+
+def _import_text_qkv(ctx: io.TransformCTX, q, k, v):
+    text_config = ctx.target.config.language_model_config
+
+    head_num = text_config.num_attention_heads
+    num_query_groups = text_config.num_query_groups
+    head_size = text_config.kv_channels
+    hidden_size = text_config.hidden_size
+    return _merge_qkv(q, k, v, head_num, num_query_groups, head_size, hidden_size)
+
+
+def _import_text_kv(ctx: io.TransformCTX, k, v):
+    text_config = ctx.target.config.language_model_config
+
+    head_num = text_config.num_attention_heads
+    num_query_groups = text_config.num_query_groups
+    head_size = text_config.kv_channels
+    hidden_size = text_config.hidden_size
+    return _merge_kv(k, v, head_num, num_query_groups, head_size, hidden_size)
+
+
+def _merge_kv(k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
+    old_tensor_shape = k.size()
+    new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
+
+    k = k.view(*new_kv_tensor_shape)
+    v = v.view(*new_kv_tensor_shape)
+
+    kv_weights = torch.stack((k, v), dim=1)
+    kv_weights = kv_weights.reshape(-1, *new_kv_tensor_shape[1:])
+    assert kv_weights.ndim == 3, kv_weights.shape
+    assert kv_weights.shape[0] == 2 * num_query_groups, kv_weights.shape
+    assert kv_weights.shape[1] == head_size, kv_weights.shape
+    assert kv_weights.shape[2] == old_tensor_shape[1], kv_weights.shape
+
+    kv_weights = kv_weights.reshape([head_size * 2 * num_query_groups, hidden_size])
+    return kv_weights
+
+
+def _merge_qkv(
+    q: Tensor, k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int
+):
+    heads_per_group = head_num // num_query_groups
+    old_tensor_shape = q.size()
+    new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+    new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
+
+    q = q.view(*new_q_tensor_shape)
+    k = k.view(*new_kv_tensor_shape)
+    v = v.view(*new_kv_tensor_shape)
+
+    qkv_weights_l = []
+    for i in range(num_query_groups):
+        qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :])
+        qkv_weights_l.append(k[i : i + 1, :, :])
+        qkv_weights_l.append(v[i : i + 1, :, :])
+    qkv_weights = torch.cat(qkv_weights_l)
+    assert qkv_weights.ndim == 3, qkv_weights.shape
+    assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape
+    assert qkv_weights.shape[1] == head_size, qkv_weights.shape
+    assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape
+
+    qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])
+
+    return qkv_weights
+
+
+def _split_qkv(qkv, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
+    heads_per_group = head_num // num_query_groups
+    qkv_total_dim = head_num + 2 * num_query_groups
+
+    linear_qkv = qkv.reshape([qkv_total_dim, head_size, hidden_size])
+    q_slice = torch.cat(
+        [
+            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+            for i in range(num_query_groups)
+        ]
+    )
+    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+    q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
+    k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
+    v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
+
+    return q_proj, k_proj, v_proj
+
+
+def _import_simple_concat(a, b):
+    # for both (w1, w3) -> fc1, and (wk, wv) -> wkv
+    return torch.cat((a, b), dim=0)
+
+
+def _rename_xattn_layer_nums(source: Dict):
+    def convert_layer_num(match):
+        new_layer_num = int(match.group(1)) * 4 + 3
+        return f'.{new_layer_num}.'
+
+    output_dict = {}
+    for k, v in source.items():
+        if "cross_attention_layers" in k:
+            output_dict[re.sub(r"\.(\d+)\.", convert_layer_num, k)] = v
+        else:
+            output_dict[k] = v
+    return output_dict
diff --git a/nemo/collections/vlm/mllama/model/utils.py b/nemo/collections/vlm/mllama/model/utils.py
new file mode 100644
index 0000000000000..786be18020a43
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/utils.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Tuple
+
+import torch
+
+
+def _pad_attention_masks(
+    masks: torch.Tensor,
+    num_chunks: torch.Tensor,
+    total_length: int,
+    max_chunks: int,
+    device: torch.device,
+    dtype=torch.bfloat16,
+) -> torch.Tensor:
+    """
+    Pads the provided masks to a uniform shape for batching.
+
+    Args:
+        masks (torch.Tensor): List of tensors containing attention masks for each batch.
+        num_chunks (torch.Tensor): Tensor containing the number of chunks for each mask.
+        total_length (int): Total sequence length for padding.
+        max_chunks (int): Maximum number of chunks to pad each mask to.
+        device (torch.device): Device to place the output tensor on.
+        dtype (torch.dtype): Data type for the output tensor. Default is `torch.bfloat16`.
+
+    Returns:
+        torch.Tensor: A padded tensor of shape [B, total_length, max_num_media, max_chunks]
+        where `B` is the batch size.
+    """
+    mask_value = 1.0
+    batch_size = len(masks)
+    max_num_media = max([len(m) for m in masks])
+
+    padded_masks = torch.full(
+        (batch_size, total_length, max_num_media, max_chunks),
+        mask_value,
+        dtype=dtype,
+        device=device,
+    )
+
+    for idx, (mask_group, chunks) in enumerate(zip(masks, num_chunks)):
+        for media_idx, (mask, chunk_count) in enumerate(zip(mask_group, chunks)):
+            if len(mask) == 2:
+                mask[1] = min(mask[1], total_length)
+                if mask[1] == -1:
+                    mask[1] = total_length
+                padded_masks[idx, mask[0] : mask[1], media_idx, :chunk_count].fill_(0.0)
+
+    return padded_masks
+
+
+def _get_full_row_masked_out_mask(
+    attention_bias: torch.Tensor,
+    mask_value: float,
+):
+    """
+    Determines whether each row in the attention bias tensor contains masked values.
+
+    Args:
+        attention_bias (torch.Tensor): A 4D tensor of shape [B, H, S1, S2], where:
+            - B: Batch size.
+            - H: Number of attention heads.
+            - S1: Length of the first sequence.
+            - S2: Length of the second sequence.
+        mask_value (float): The value used to represent masked positions in `attention_bias`.
+
+    Returns:
+        torch.Tensor: A 4D tensor of shape [B, H, S1, 1], containing boolean values (as a tensor)
+        indicating if each row in the last dimension is fully masked (0 if fully masked, 1 otherwise).
+    """
+    return (attention_bias != mask_value).any(dim=-1).type_as(attention_bias)[..., None]
+
+
+def _generate_cross_attention_mask(
+    text_token_count: int,
+    text_device: torch.device,
+    text_dtype: torch.dtype,
+    vision_tokens: torch.Tensor,
+    cross_attention_masks: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generates a cross-attention mask for aligning text and vision tokens.
+
+    Args:
+        text_token_count (int): Number of tokens in the text sequence.
+        text_device (torch.device): Device to place the output tensor on.
+        text_dtype (torch.dtype): Data type for the output tensor.
+        vision_tokens (torch.Tensor): Vision tokens tensor of shape [B, I, T, D] where:
+            - B: Batch size.
+            - I: Number of images.
+            - T: Number of image tokens per image.
+            - D: Dimension of each image token.
+        cross_attention_masks (torch.Tensor): Cross attention masks of shape [B, N, I, C], where:
+            - B: Batch size.
+            - N: Number of text tokens.
+            - I: Number of images.
+            - C: Number of chunks.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - The adjusted cross-attention masks of shape [B, 1, N, I * T].
+            - The full row mask status tensor of shape [B, 1, N, 1].
+    """
+    assert vision_tokens is not None, "Vision tokens must be provided"
+    vision_token_length = vision_tokens.shape[3]
+    assert (
+        vision_tokens.shape[1] == cross_attention_masks.shape[2]
+    ), f"Mismatch in number of images given and number of masks provided: {vision_tokens.shape} vs {cross_attention_masks.shape}"
+    assert (
+        vision_tokens.shape[2] == cross_attention_masks.shape[3]
+    ), f"Mismatch between vision tokens and cross-attention masks: {vision_tokens.shape} vs {cross_attention_masks.shape}"
+    assert (
+        text_token_count == cross_attention_masks.shape[1]
+    ), f"Text sequence length {text_token_count} does not match cross-attention mask length {cross_attention_masks.shape[1]}"
+
+    batch_size, _, num_images, num_chunks = cross_attention_masks.shape
+    cross_attention_masks = cross_attention_masks.view(batch_size, text_token_count, -1).unsqueeze(1)
+
+    full_row_mask_status = _get_full_row_masked_out_mask(cross_attention_masks, mask_value=1.0)
+    cross_attention_masks = cross_attention_masks.repeat_interleave(vision_token_length, dim=3)
+    cross_attention_masks *= full_row_mask_status
+
+    return (
+        cross_attention_masks.to(device=text_device, dtype=text_dtype),
+        full_row_mask_status.to(device=text_device, dtype=text_dtype),
+    )
+
+
+def create_vision_mask_tensor(tokens: torch.Tensor, vision_token_id: int = 128256) -> torch.Tensor:
+    """
+    Create a vision mask from a tensor of tokens and a vision token ID.
+
+    Args:
+        tokens (torch.Tensor): A 1D tensor of token IDs.
+        vision_token_id (int): The ID of the vision token.
+
+    Returns:
+        torch.Tensor: A tensor containing vision masks in the format [start, end].
+    """
+    # Get the locations of the vision tokens
+    vision_token_locations = (tokens == vision_token_id).nonzero(as_tuple=False).squeeze()
+
+    # If no vision token found, return an empty tensor
+    if vision_token_locations.numel() == 0:
+        return torch.empty(1, 2, dtype=torch.long)
+
+    vision_masks = []
+
+    # Handle case with only one vision token
+    if vision_token_locations.numel() == 1:
+        vision_masks.append([vision_token_locations.item(), len(tokens)])
+    else:
+        # Multiple vision tokens, pairwise masks
+        for i in range(len(vision_token_locations) - 1):
+            vision_masks.append([vision_token_locations[i].item(), vision_token_locations[i + 1].item()])
+        # Last vision token attends to all subsequent text
+        vision_masks.append([vision_token_locations[-1].item(), len(tokens)])
+
+    # Handle consecutive vision tokens
+    last_mask_end = vision_masks[-1][1]
+    for vision_mask in reversed(vision_masks):
+        if vision_mask[0] == vision_mask[1] - 1:
+            vision_mask[1] = last_mask_end
+        last_mask_end = vision_mask[1]
+
+    return torch.tensor(vision_masks, dtype=torch.long)
diff --git a/nemo/collections/vlm/mllama/model/vision.py b/nemo/collections/vlm/mllama/model/vision.py
new file mode 100644
index 0000000000000..f662546d21ae9
--- /dev/null
+++ b/nemo/collections/vlm/mllama/model/vision.py
@@ -0,0 +1,640 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import copy
+import math
+import types
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from megatron.core import InferenceParams, parallel_state, tensor_parallel
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
+from megatron.core.transformer.custom_layers.transformer_engine import (
+    TEColumnParallelLinear,
+    TEDotProductAttention,
+    TERowParallelLinear,
+)
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.mlp import MLP, MLPSubmodules
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.transformer_block import TransformerBlock
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from megatron.core.utils import make_viewless_tensor
+from PIL import Image as PIL_Image
+from torch import Tensor, nn
+
+if TYPE_CHECKING:
+    from nemo.collections.vlm import CrossAttentionVisionConfig
+
+try:
+    from megatron.core.transformer.custom_layers.transformer_engine import TEDelayedScaling, TENorm
+
+    HAVE_TE = True
+    LayerNormImpl = TENorm
+except ImportError:
+    from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
+
+    HAVE_TE = False
+    LayerNormImpl = WrappedTorchLayerNorm
+
+
+def to_2tuple(x):
+    if isinstance(x, collections.abc.Iterable):
+        return x
+    return (x, x)
+
+
+def _stack_images(
+    images: List[List[PIL_Image.Image]],
+    max_num_chunks: int,
+    image_res: int,
+    max_num_images: int,
+) -> Tuple[torch.Tensor, List[int]]:
+    """
+    Takes a list of list of images and stacks them into a tensor.
+    This function is needed since images can be of completely
+    different resolutions and aspect ratios.
+    """
+    out_images, out_num_chunks = [], []
+    for imgs_sample in images:
+        out_images_i = torch.zeros(
+            max_num_images,
+            max_num_chunks,
+            3,
+            image_res,
+            image_res,
+        )
+        _num_chunks = []
+        for j, chunks_image in enumerate(imgs_sample):
+            out_images_i[j, : chunks_image.shape[0]] = chunks_image
+            _num_chunks.append(chunks_image.shape[0])
+        out_images.append(out_images_i)
+        out_num_chunks.append(_num_chunks)
+    return torch.stack(out_images), out_num_chunks
+
+
+def build_encoder_attention_mask(
+    x: torch.Tensor, ar_ids: torch.Tensor, ntok: int, num_chunks: int, supported_aspect_ratios: List[List[int]]
+):
+    """
+    Build vision encoder attention mask that omits padding tiles and tokens.
+    """
+    masks = []
+    for ar_id in ar_ids:
+        arx = supported_aspect_ratios[ar_id - 1]
+        mask_i = torch.ones((num_chunks, x.shape[1] // num_chunks), device=x.device)
+        mask_i[: arx[0] * arx[1], :ntok] = 0
+        mask_i = mask_i.view(num_chunks * x.shape[1] // num_chunks, -1)
+        mask_i = (mask_i @ mask_i.T).type(torch.bool)
+        mask_i = mask_i.unsqueeze(0)
+        masks.append(mask_i)
+    masks = torch.stack(masks)
+    return masks
+
+
+def apply_scaling(freqs: torch.Tensor):
+    # Values obtained from grid search
+    scale_factor = 8
+    low_freq_factor = 1
+    high_freq_factor = 4
+    old_context_len = 8192  # original llama3 length
+
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+    new_freqs = []
+    for freq in freqs:
+        wavelen = 2 * math.pi / freq
+        if wavelen < high_freq_wavelen:
+            new_freqs.append(freq)
+        elif wavelen > low_freq_wavelen:
+            new_freqs.append(freq / scale_factor)
+        else:
+            assert low_freq_wavelen != high_freq_wavelen
+            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
+    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
+
+
+# Use this spec for an implementation using modules in TE
+def get_image_transformer_layer_spec() -> ModuleSpec:
+    image_transformer_submodules = TransformerLayerSubmodules(
+        input_layernorm=TENorm,
+        self_attention=ModuleSpec(
+            module=SelfAttentionNoBias,
+            params={"attn_mask_type": AttnMaskType.no_mask},
+            submodules=SelfAttentionSubmodules(
+                linear_qkv=TEColumnParallelLinear,
+                core_attention=TEDotProductAttention,
+                linear_proj=TERowParallelLinear,
+                q_layernorm=IdentityOp,
+                k_layernorm=IdentityOp,
+            ),
+        ),
+        self_attn_bda=get_bias_dropout_add,
+        pre_mlp_layernorm=TENorm,
+        mlp=ModuleSpec(
+            module=MLP,
+            submodules=MLPSubmodules(
+                linear_fc1=TEColumnParallelLinear,
+                linear_fc2=TERowParallelLinear,
+            ),
+        ),
+        mlp_bda=get_bias_dropout_add,
+    )
+    return ModuleSpec(module=ImageTransformerLayer, submodules=image_transformer_submodules)
+
+
+def forward_with_return_intermediate(
+    self,
+    hidden_states: Tensor,
+    attention_mask: Tensor,
+    context: Tensor = None,
+    context_mask: Tensor = None,
+    rotary_pos_emb: Tensor = None,
+    inference_params: InferenceParams = None,
+    packed_seq_params: PackedSeqParams = None,
+    return_intermediate: List[int] = None,
+):
+    # hidden_states (float): [s, b, h]
+    # attention_mask (bool): [1, 1, s, s]
+
+    if not self.pre_process:
+        # See set_input_tensor()
+        hidden_states = self.input_tensor
+
+    hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
+
+    if self.config.sequence_parallel:
+        rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
+    else:
+        rng_context = nullcontext()
+
+    if self.config.fp8:
+        import transformer_engine  # To keep out TE dependency when not training in fp8
+
+        if self.config.fp8 == "e4m3":
+            fp8_format = transformer_engine.common.recipe.Format.E4M3
+        elif self.config.fp8 == "hybrid":
+            fp8_format = transformer_engine.common.recipe.Format.HYBRID
+        else:
+            raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
+
+        fp8_recipe = TEDelayedScaling(
+            config=self.config,
+            fp8_format=fp8_format,
+            override_linear_precision=(False, False, not self.config.fp8_wgrad),
+        )
+        fp8_group = None
+        if parallel_state.model_parallel_is_initialized():
+            fp8_group = parallel_state.get_amax_reduction_group(with_context_parallel=True)
+        fp8_context = transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group)
+    else:
+        fp8_context = nullcontext()
+
+    with rng_context and fp8_context:
+        # Forward pass.
+        if self.config.recompute_granularity == 'full' and self.training:
+            assert return_intermediate is None, (
+                "Config `return_intermediate` cannot be used with " "`recompute_granularity='full'`. "
+            )
+            hidden_states = self._checkpointed_forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                context=context,
+                context_mask=context_mask,
+                rotary_pos_emb=rotary_pos_emb,
+                packed_seq_params=packed_seq_params,
+            )
+        else:
+            intermediate_hidden_states = []
+            for l_no, layer in enumerate(self.layers):
+                if return_intermediate is not None and l_no in return_intermediate:
+                    intermediate_hidden_states.append(hidden_states)
+
+                with self.offload_context:
+                    if (len(self.cuda_graphs) == 0) or (not self.training):
+                        hidden_states, context = layer(
+                            hidden_states=hidden_states,
+                            attention_mask=attention_mask,
+                            context=context,
+                            context_mask=context_mask,
+                            rotary_pos_emb=rotary_pos_emb,
+                            inference_params=inference_params,
+                            packed_seq_params=packed_seq_params,
+                        )
+                        # CUDA graph doesn't output context and is expected to be None
+                        assert (context is None) or (not self.config.enable_cuda_graph) or (not self.training)
+                    else:
+                        # CUDA graph replay for layer `l_no` and microbatch `self.current_microbatch`
+                        # CUDA graph requires positional arguments with the exception of is_first_microbatch.
+                        # Also CUDA graph accepts only Tensor inputs and outputs. Hence, the arg list and
+                        # returned list is limited to `hidden_states`.
+                        assert (len(self.cuda_graphs) > l_no) and (
+                            self.current_microbatch < len(self.cuda_graphs[l_no])
+                        )
+                        hidden_states = self.cuda_graphs[l_no][self.current_microbatch](
+                            hidden_states, is_first_microbatch=(self.current_microbatch == 0)
+                        )
+
+                if (
+                    torch.is_grad_enabled()
+                    and self.config.cpu_offloading
+                    and self.group_prefetch_offload_commit_async is not None
+                ):
+                    hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
+
+        # Final layer norm.
+        if self.final_layernorm is not None:
+            hidden_states = self.final_layernorm(hidden_states)
+            # TENorm produces a "viewed" tensor. This will result in schedule.py's
+            # deallocate_output_tensor() throwing an error, so a viewless tensor is
+            # created to prevent this.
+            hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
+
+        if return_intermediate is not None:
+            return hidden_states, torch.stack(intermediate_hidden_states, dim=-1)
+
+        return hidden_states
+
+
+class ColumnParallelConv2dPatch(MegatronModule):
+    """Conv2D Patching layer with model parallelism.
+    Column parallel over unfolded input.
+    Arguments:
+        in_channels: Input channels.
+        out_channels: Output channels.
+        kernel_size: Size of convolution kernel.
+        stride (default 1): Stride for convolution.
+        bias (default False): Use bias in Conv2d.
+    Input: (bsz, in_channels, width, height)
+    Output: (bsz, num_tokens, out_channels)
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]],
+        bias: Optional[bool] = False,
+    ) -> None:
+        super().__init__(config=config)
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
+        self._linear = TEColumnParallelLinear(
+            in_channels * kernel_size[0] * kernel_size[1],
+            out_channels,
+            bias=bias,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='conv1',
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._unfold(x)
+        x = x.permute(0, 2, 1)
+        x = F.linear(x, self._linear.weight)
+        x = tensor_parallel.gather_from_tensor_model_parallel_region(x)
+        return x
+
+
+class PrecomputedTilePositionEmbedding(torch.nn.Module):
+    def __init__(
+        self,
+        config: TransformerConfig,
+        gated: bool = False,
+    ):
+        super().__init__()
+        self.max_num_tiles = config.max_num_tiles
+        self.hidden_size = config.hidden_size
+        self.max_aspect_ratio_id = config.max_aspect_ratio_id
+
+        self.embedding = nn.Embedding(self.max_aspect_ratio_id + 1, self.max_num_tiles * self.hidden_size)
+        self.gated = gated
+        if gated:
+            self.gate = nn.Parameter(torch.zeros(1))
+
+    def forward(self, hidden_states: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor:
+        embeddings = self.embedding(aspect_ratio_ids)
+        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)
+
+        if self.gated:
+            embeddings = embeddings * self.gate.tanh()
+
+        hidden_states = hidden_states + embeddings
+        return hidden_states
+
+
+class SelfAttentionNoBias(SelfAttention):
+    """Self-attention layer class without bias"""
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: SelfAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type=AttnMaskType.padding,
+    ):
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            attn_mask_type=attn_mask_type,
+        )
+
+        # Override to remove bias since we don't have a good config for this.
+        self.linear_qkv = build_module(
+            submodules.linear_qkv,
+            self.config.hidden_size,
+            self.query_projection_size + 2 * self.kv_projection_size,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=False,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='qkv',
+        )
+
+        self.linear_proj = build_module(
+            submodules.linear_proj,
+            self.query_projection_size,
+            self.config.hidden_size,
+            config=self.config,
+            init_method=self.config.output_layer_init_method,
+            bias=False,
+            input_is_parallel=True,
+            skip_bias_add=True,
+            is_expert=False,
+            tp_comm_buffer_name='proj',
+        )
+
+
+class ImageTransformerLayer(TransformerLayer):
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: TransformerLayerSubmodules,
+        layer_number: int = 1,
+        hidden_dropout: float = None,
+    ):
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            hidden_dropout=hidden_dropout,
+        )
+        self.gated = self.config.gated
+        if self.gated:
+            self.gate_attn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))
+            self.gate_ffn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        context=None,
+        context_mask=None,
+        rotary_pos_emb=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
+        inference_params=None,
+        packed_seq_params=None,
+    ):
+        # hidden_states: [s, b, h]
+
+        # Residual connection.
+        residual = hidden_states
+
+        # Optional Input Layer norm
+        input_layernorm_output = self.input_layernorm(hidden_states)
+
+        # Self attention.
+        attention_output_with_bias = self.self_attention(
+            input_layernorm_output,
+            attention_mask=attention_mask,
+            inference_params=inference_params,
+            rotary_pos_emb=rotary_pos_emb,
+            packed_seq_params=packed_seq_params,
+        )
+
+        _gate_attn = 1 if not self.gated else self.gate_attn.tanh()
+        assert isinstance(
+            attention_output_with_bias, tuple
+        ), "`attention_output_with_bias` needs to be tuple for gating."
+        attention_output_with_bias = tuple(
+            _gate_attn * output if output is not None else None for output in attention_output_with_bias
+        )
+
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
+                attention_output_with_bias, residual, self.hidden_dropout
+            )
+
+        # Residual connection.
+        residual = hidden_states
+
+        # Optional Layer norm post the cross-attention.
+        pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
+
+        # MLP.
+        mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output)
+
+        _gate_ffn = 1 if not self.gated else self.gate_ffn.tanh()
+        assert isinstance(mlp_output_with_bias, tuple), "`mlp_output_with_bias` needs to be tuple for gating."
+        mlp_output_with_bias = tuple(
+            _gate_ffn * output if output is not None else None for output in mlp_output_with_bias
+        )
+
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
+                mlp_output_with_bias, residual, self.hidden_dropout
+            )
+
+        output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True)
+
+        # CUDA graph requires returned values to be Tensors
+        if self.config.external_cuda_graph and self.training:
+            return output
+        return output, context
+
+
+class VisionEncoder(MegatronModule):
+    def __init__(
+        self,
+        config: 'CrossAttentionVisionConfig',
+        image_size: int = 560,
+        patch_size: int = 14,
+        in_channels: int = 3,
+        pre_process: bool = True,
+        post_process: bool = True,
+        return_intermediate=None,
+    ):
+        super().__init__(config=config)
+        self.return_intermediate = return_intermediate
+        self.image_size = to_2tuple(image_size)
+        self.patch_size = to_2tuple(patch_size)
+        self.grid_size = (
+            self.image_size[0] // self.patch_size[0],
+            self.image_size[1] // self.patch_size[1],
+        )
+        self.pre_process = pre_process
+        self.post_process = post_process
+
+        self.max_aspect_ratio_id = self.config.max_aspect_ratio_id
+        self.max_num_tiles = config.max_num_tiles
+        width = config.hidden_size
+        self.conv1 = ColumnParallelConv2dPatch(
+            config=config,
+            in_channels=in_channels,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+        )
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width))
+        self.ln_post = LayerNormImpl(config=config, hidden_size=width)
+        self.ln_pre = LayerNormImpl(config=config, hidden_size=width)
+        self.transformer = TransformerBlock(
+            config=self.config,
+            spec=get_image_transformer_layer_spec(),
+            post_layer_norm=False,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+        )
+        self.transformer.forward = types.MethodType(forward_with_return_intermediate, self.transformer)
+        # pre and post tile position embedding
+        global_config = copy.deepcopy(self.config)
+        global_config.num_layers = self.config.num_global_layers
+        global_config.gated = True
+        self.global_transformer = TransformerBlock(
+            config=global_config,
+            spec=get_image_transformer_layer_spec(),
+            post_layer_norm=False,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+        )
+        # pre and post tile position embedding
+        self.pre_tile_pos_embed = PrecomputedTilePositionEmbedding(
+            config=config,
+            gated=True,
+        )
+        self.post_tile_pos_embed = PrecomputedTilePositionEmbedding(
+            config=config,
+            gated=True,
+        )
+        self.gated_tile_positional_embedding = nn.Embedding(
+            self.max_aspect_ratio_id + 1, self.max_num_tiles * (self.grid_size[0] * self.grid_size[1] + 1) * width
+        )
+        self.gated_positional_embedding_gate = nn.Parameter(torch.zeros(1))
+
+    def apply_positional_embedding(self, x, aspect_ratio_ids):
+        # apply regular position embedding
+        bsz, num_chunks, num_tokens, dim = x.shape
+        x = x.view(bsz * num_chunks, num_tokens, dim)
+        x = x + self.positional_embedding * (1 - self.gated_positional_embedding_gate.tanh())
+        x = x.view(bsz, num_chunks, num_tokens, dim)
+        tile_position_embedding = self.gated_tile_positional_embedding(aspect_ratio_ids)
+        tile_position_embedding = tile_position_embedding.reshape(bsz, num_chunks, num_tokens, dim)
+        x = x + self.gated_positional_embedding_gate.tanh() * tile_position_embedding
+        return x
+
+    def apply_class_embedding(self, x):
+        x = torch.cat(
+            [
+                self.class_embedding.to(x.dtype)
+                + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
+                x,
+            ],
+            dim=1,
+        )  # shape = [*, grid ** 2 + 1, width]
+        return x
+
+    def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor:
+        if images.ndim == 5:
+            num_concurrent_media = 1
+            bsz, num_chunks, nch, w, h = images.shape
+        else:
+            bsz, num_concurrent_media, num_chunks, nch, w, h = images.shape
+
+        images = images.reshape(bsz * num_concurrent_media * num_chunks, nch, w, h)
+        ar_ids = ar_ids.reshape(bsz * num_concurrent_media, 1)
+
+        # patch embedding
+        x = images.reshape(bsz * num_concurrent_media * num_chunks, nch, w, h)
+        x = self.conv1(x)  # shape = [*, width, grid ** 2]
+        _, ntok, dim = x.shape
+        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)
+
+        # tile embeddings
+        x = self.pre_tile_pos_embed(x, ar_ids)
+        x = x.reshape(bsz * num_concurrent_media * num_chunks, ntok, dim)
+
+        # apply cls token
+        x = self.apply_class_embedding(x)
+        ntok += 1
+
+        # apply position embeddings
+        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)
+        x = self.apply_positional_embedding(x, ar_ids)
+
+        x = self.ln_pre(x)
+        x = x.view(bsz * num_concurrent_media, -1, dim)
+
+        npad, attn_mask = 0, None
+        attn_mask = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios)
+        x = x.transpose(0, 1).contiguous()
+        x, int_x = self.transformer(
+            hidden_states=x,
+            attention_mask=attn_mask,
+            return_intermediate=self.return_intermediate,
+        )
+
+        # [ntok * num_concurrent_media * num_chunks, bsz, hidden_size] ->  [bsz, ntok * num_concurrent_media * num_chunks, hidden_size]
+        x, int_x = x.transpose(0, 1).contiguous(), int_x.transpose(0, 1).contiguous()
+        x = self.ln_post(x)
+        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim)
+        x = self.post_tile_pos_embed(x, ar_ids)
+        x = x.reshape(bsz * num_concurrent_media, num_chunks * (ntok + npad), dim)
+        x = x.transpose(0, 1).contiguous()
+        x = self.global_transformer(
+            hidden_states=x,
+            attention_mask=None,
+        )
+        x = x.transpose(0, 1)
+        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim)
+
+        # adding back intermediate layer outputs
+        x = x.reshape(bsz, num_concurrent_media, num_chunks, ntok, dim)
+        int_x = int_x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, -1)
+        # int_x = contract_num_tokens_from_mult8(int_x, npad)
+        int_x = int_x.reshape(bsz, num_concurrent_media, num_chunks, ntok, -1)
+        x = torch.cat([x, int_x], dim=-1)
+        return x
diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py
index bbd502e21c803..f210d01a06fda 100644
--- a/nemo/collections/vlm/neva/data/__init__.py
+++ b/nemo/collections/vlm/neva/data/__init__.py
@@ -14,12 +14,12 @@
 
 from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig
 from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule
-from nemo.collections.vlm.neva.data.mock import MockDataModule
+from nemo.collections.vlm.neva.data.mock import MockDataModule as NevaMockDataModule
 from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken
 
 __all__ = [
     "NevaLazyDataModule",
-    "MockDataModule",
+    "NevaMockDataModule",
     "DataConfig",
     "ImageDataConfig",
     "VideoDataConfig",
diff --git a/nemo/collections/vlm/neva/data/conversation.py b/nemo/collections/vlm/neva/data/conversation.py
index 22c435cb1fd2b..d78d3bd28acb9 100644
--- a/nemo/collections/vlm/neva/data/conversation.py
+++ b/nemo/collections/vlm/neva/data/conversation.py
@@ -34,6 +34,7 @@ class SeparatorStyle(Enum):
     CHATML = auto()
     LLAMA_2 = auto()
     LLAMA_3 = auto()
+    MLLAMA = auto()
     MISTRAL = auto()
     NVGPT = auto()
     QWEN = auto()
@@ -153,6 +154,11 @@ def get_prompt(self):
             tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Meta-Llama-3-8B-Instruct"
             ret = self.process_chat_template(tokenizer_name_or_path, messages)
 
+        elif self.sep_style == SeparatorStyle.MLLAMA:
+            """ """
+            tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Llama-3.2-11B-Vision-Instruct"
+            ret = self.process_chat_template(tokenizer_name_or_path, messages)
+
         elif self.sep_style == SeparatorStyle.NVGPT:
             ret = self.sep2 + self.system + self.sep
             for role, message in messages:
@@ -458,6 +464,18 @@ def dict(self):
     stop_str="<|eot_id|>",
 )
 
+conv_mllama = Conversation(
+    system="",
+    roles=("user", "assistant"),
+    version="llama_v3_2",
+    messages=[],
+    offset=0,
+    sep="<|eot_id|>",
+    sep_style=SeparatorStyle.MLLAMA,
+    tokenizer_name_or_path="meta-llama/Llama-3.2-11B-Vision-Instruct",
+    stop_str="<|eot_id|>",
+)
+
 conv_mistral_instruct = Conversation(
     system="",
     roles=("USER", "ASSISTANT"),
@@ -648,6 +666,7 @@ def dict(self):
     "v1": conv_vicuna_v1,
     "vicuna_v1": conv_vicuna_v1,
     "llama_2": conv_llama_2,
+    "mllama": conv_mllama,
     "mistral_instruct": conv_mistral_instruct,
     "mistral_orca": conv_mistral_orca,
     "mistral_zephyr": conv_mistral_zephyr,
diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py
index ca1179e240338..57aa5b4088356 100644
--- a/nemo/collections/vlm/neva/data/lazy.py
+++ b/nemo/collections/vlm/neva/data/lazy.py
@@ -12,37 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import TYPE_CHECKING, Optional
-
-import pytorch_lightning as pl
-from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
-from torch.utils import data
-from torch.utils.data import DataLoader
-
-from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig
-from nemo.collections.vlm.neva.data.conversation import conv_templates as supported_conv_templates
-from nemo.lightning.pytorch.plugins import MegatronDataSampler
-
-if TYPE_CHECKING:
-    pass
-
 import json
 import logging
 import os
 import re
 import tarfile
-from typing import Any, Dict, List, Sequence
+from typing import Any, Dict, List, Optional, Sequence
 
 import decord
 import numpy as np
+import pytorch_lightning as pl
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from torch.utils.data import Dataset, default_collate
+from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+from torch.utils import data
+from torch.utils.data import DataLoader, Dataset, default_collate
 from transformers import CLIPImageProcessor, SiglipImageProcessor
 
 from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
+from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig
+from nemo.collections.vlm.neva.data.conversation import conv_templates as supported_conv_templates
 from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX, SPECIAL_TOKEN_MAP
+from nemo.lightning.pytorch.plugins import MegatronDataSampler
 
 
 class TarOrFolderImageLoader:
@@ -259,6 +251,7 @@ def __init__(
         data_config,
         tokenizer,
         image_processor,
+        sequence_length,
     ):
         super().__init__()
         if data_path is not None:
@@ -270,7 +263,13 @@ def __init__(
         logging.warning("Formatting inputs...Skip in lazy mode")
         self.data_config = data_config
         self.tokenizer = tokenizer
+        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
+        if isinstance(self.tokenizer, AutoTokenizer):
+            self.tokenizer = self.tokenizer.tokenizer
+
         self.image_processor = image_processor
+        self.sequence_length = sequence_length
 
         self.conv_template = data_config.conv_template
         self.conv = supported_conv_templates[self.conv_template]
@@ -323,8 +322,13 @@ def _apply_prompt_templates(self, source, use_plain=False):
         roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 
         source = source['conversations']
-        if roles[source[0]["from"]] != conv.roles[0]:
-            source = source[1:]
+
+        def _fix_roles(roles):
+            if len(source) < 2:
+                return roles
+            return {source[0]["from"]: conv.roles[0], source[1]["from"]: conv.roles[1]}
+
+        roles = _fix_roles(roles)
 
         conv.messages = []
         for j, sentence in enumerate(source):
@@ -354,6 +358,7 @@ def _tokenize_and_label(self, conversations):
                 return_tensors="pt",
             )[0]
             answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index)
+            assert answer_start > 0, "Not found valid answer in conversation."
             labels[answer_start:answer_end] = tokens[answer_start:answer_end]
             search_start_index = answer_end
         tokens = tokens[:-1]
diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py
index 7d0c53b793210..260b7e7e0f4a1 100644
--- a/nemo/collections/vlm/neva/model/base.py
+++ b/nemo/collections/vlm/neva/model/base.py
@@ -139,6 +139,9 @@ def configure_model(self) -> "MCoreMultimodalProjector":
                     ),
                 )
                 self.layer_spec = self.layer_spec.submodules
+            elif self.projector_type == "mcore_affine":
+                self.projector_type = "affine"  # strip "mcore_" for mcore init
+                self.layer_spec = MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=None)
             else:
                 raise NotImplementedError(f"Not supported projector type `{self.projector_type}`")
 
@@ -620,7 +623,6 @@ class NevaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
     def __init__(
         self,
         config: NevaConfig,
-        # TODO: Add transformer_layer_spec when we update mcore
         optim: Optional[OptimizerModule] = None,
         tokenizer: Optional["TokenizerSpec"] = None,
         model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
diff --git a/nemo/collections/vlm/peft/__init__.py b/nemo/collections/vlm/peft/__init__.py
new file mode 100644
index 0000000000000..ab0c451a7d9d3
--- /dev/null
+++ b/nemo/collections/vlm/peft/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.peft.lora import LoRA
+
+__all__ = ["LoRA"]
diff --git a/nemo/collections/vlm/peft/lora.py b/nemo/collections/vlm/peft/lora.py
new file mode 100644
index 0000000000000..1e394daa8ead6
--- /dev/null
+++ b/nemo/collections/vlm/peft/lora.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from torch import nn
+
+from nemo.collections.llm.peft.lora import LoRA as LLMLoRA
+
+
+@dataclass
+class LoRA(LLMLoRA):
+    """
+    Built on top of llm.LoRA, vlm.LoRA additionally allows the user to specify whether the language or vision
+    models should be frozen.
+    For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
+    finetune the vision model.
+
+    For detailed usage of the LoRA api, see llm.LoRA docstrings.
+
+    Example:
+    --------
+        >>> from nemo.collections import vlm
+        >>> lora = vlm.peft.LoRA(target_modules=["*.language_model.*.linear_qkv"], freeze_vision_model=False, dim=32)
+        >>> model = vlm.MLlamaModel(model_transform=lora)
+        >>> # (set up trainer and data)
+        >>> trainer.fit(model, data)
+
+    References:
+    -----------
+        Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021).
+        LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685.
+        https://arxiv.org/abs/2106.09685
+
+    )
+
+    """
+
+    freeze_language_model: bool = True
+    freeze_vision_model: bool = False
+
+    def freeze_model(self, model: nn.Module) -> None:
+        modules = []
+        if self.freeze_language_model and model.module.module.language_model is not None:
+            modules.append(model.module.module.language_model)
+        if self.freeze_vision_model and model.module.module.vision_model is not None:
+            modules.append(model.module.module.vision_model)
+
+        for module in modules:
+            for param in module.parameters():
+                param.requires_grad = False
diff --git a/nemo/collections/vlm/recipes/__init__.py b/nemo/collections/vlm/recipes/__init__.py
new file mode 100644
index 0000000000000..2b71ecc50f8f8
--- /dev/null
+++ b/nemo/collections/vlm/recipes/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from nemo.collections.vlm.recipes import mllama_11b, mllama_90b
+
+__all__ = [
+    "mllama_11b",
+    "mllama_90b",
+]
diff --git a/nemo/collections/vlm/recipes/mllama_11b.py b/nemo/collections/vlm/recipes/mllama_11b.py
new file mode 100644
index 0000000000000..697be9990faff
--- /dev/null
+++ b/nemo/collections/vlm/recipes/mllama_11b.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections import llm, vlm
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.vlm.mllama.data.mock import MockDataModule
+
+NAME = "mllama_11b"
+
+
+@run.cli.factory(name=NAME)
+def model() -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Llama-3.2-Vision 11B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Llama-3.2-Vision 11B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=mllama_11b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig11B))
+
+
+@run.cli.factory(target=llm.finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    peft_scheme: Optional[str] = 'lora',
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for Llama3.2 11B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory mllama_11b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="mllama_11b_finetune", num_nodes=1)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses the SQuAD dataset for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+
+    strategy = run.Config(
+        nl.MegatronStrategy,
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        encoder_pipeline_model_parallel_size=0,
+        pipeline_dtype=torch.bfloat16,
+    )
+
+    trainer = run.Config(
+        nl.Trainer,
+        accelerator="gpu",
+        accumulate_grad_batches=1,
+        devices=num_gpus_per_node,
+        limit_val_batches=2,
+        log_every_n_steps=10,
+        max_steps=5190,
+        num_nodes=num_nodes,
+        plugins=bf16_mixed(),
+        strategy=strategy,
+        val_check_interval=100,
+    )
+
+    recipe = run.Partial(
+        llm.finetune,
+        model=model(),
+        trainer=trainer,
+        data=run.Config(
+            MockDataModule,
+            seq_length=4100,  # encoder (vision) seq length
+            decoder_seq_length=512,  # decoder (llm) seq length
+            global_batch_size=16,
+            micro_batch_size=2,
+            vocab_size=128256,
+            crop_size=(448, 448),
+            num_workers=0,
+        ),
+        log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150),
+        resume=nemo_resume("meta-llama/Llama-3.2-11B-Vision"),
+    )
+
+    if peft_scheme is None or peft_scheme.lower() == 'none':
+        recipe.trainer.strategy.tensor_model_parallel_size = 2
+        recipe.optim.config.lr = 2e-05
+    elif peft_scheme.lower() == 'lora':
+        recipe.peft = run.Config(
+            vlm.LoRA,
+            freeze_vision_model=False,
+            target_modules=[
+                "*.language_model.*.linear_qkv",
+                "*.language_model.*.linear_q",
+                "*.language_model.*.linear_kv",
+                "*.language_model.*.linear_proj",
+                "*.language_model.*.linear_fc1",
+                "*.language_model.*.linear_fc2",
+            ],
+        )
+        recipe.optim.config.lr = 1e-4
+    else:
+        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
+
+    return recipe
diff --git a/nemo/collections/vlm/recipes/mllama_90b.py b/nemo/collections/vlm/recipes/mllama_90b.py
new file mode 100644
index 0000000000000..8822aa9b189fa
--- /dev/null
+++ b/nemo/collections/vlm/recipes/mllama_90b.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections import llm, vlm
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.vlm.mllama.data.mock import MockDataModule
+
+NAME = "mllama_90b"
+
+
+@run.cli.factory(name=NAME)
+def model() -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Llama-3.2-Vision 90B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Llama-3.2-Vision 90B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=mllama_90b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig90B))
+
+
+@run.cli.factory(target=llm.finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    peft_scheme: Optional[str] = 'lora',
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for Llama3.2 90B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory mllama_90b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="mllama_90b_finetune", num_nodes=1)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses the SQuAD dataset for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+
+    strategy = run.Config(
+        nl.MegatronStrategy,
+        tensor_model_parallel_size=8,
+        pipeline_model_parallel_size=1,
+        encoder_pipeline_model_parallel_size=0,
+        pipeline_dtype=torch.bfloat16,
+    )
+
+    trainer = run.Config(
+        nl.Trainer,
+        accelerator="gpu",
+        accumulate_grad_batches=1,
+        devices=num_gpus_per_node,
+        limit_val_batches=2,
+        log_every_n_steps=10,
+        max_steps=5190,
+        num_nodes=num_nodes,
+        plugins=bf16_mixed(),
+        strategy=strategy,
+        val_check_interval=100,
+    )
+
+    recipe = run.Partial(
+        llm.finetune,
+        model=model(),
+        trainer=trainer,
+        data=run.Config(
+            MockDataModule,
+            seq_length=6404,  # encoder (vision) seq length
+            decoder_seq_length=512,  # decoder (llm) seq length
+            global_batch_size=16,
+            micro_batch_size=2,
+            vocab_size=128256,
+            crop_size=(560, 560),
+            num_workers=0,
+        ),
+        log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150),
+        resume=nemo_resume("meta-llama/Llama-3.2-90B-Vision"),
+    )
+
+    if peft_scheme is None or peft_scheme.lower() == 'none':
+        raise ValueError("Full finetuning recipe for Llama-3.2-90B model will be supported soon.")
+    elif peft_scheme.lower() == 'lora':
+        recipe.peft = run.Config(
+            vlm.LoRA,
+            freeze_vision_model=False,
+            target_modules=[
+                "*.language_model.*.linear_qkv",
+                "*.language_model.*.linear_q",
+                "*.language_model.*.linear_kv",
+                "*.language_model.*.linear_proj",
+                "*.language_model.*.linear_fc1",
+                "*.language_model.*.linear_fc2",
+            ],
+        )
+        recipe.optim.config.lr = 1e-4
+    else:
+        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
+
+    return recipe
diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py
index 6c7fd128e530e..9cf686464417e 100644
--- a/nemo/lightning/data.py
+++ b/nemo/lightning/data.py
@@ -375,6 +375,7 @@ def __init__(
         drop_last: bool = True,
         global_batch_size: Optional[int] = None,
         pad_samples_to_global_batch_size: Optional[bool] = False,
+        seed: int = 0,
     ) -> None:
         super().__init__(
             total_samples=total_samples,
@@ -389,7 +390,30 @@ def __init__(
         assert (
             not pad_samples_to_global_batch_size
         ), "`MegatronPretrainingRandomSampler` does not support sample padding"
+        if (not drop_last) and self.micro_batch_times_data_parallel_size > 1:
+            raise RuntimeError(
+                "`MegatronPretrainingRandomSampler` does not support drop_last=False when micro_batch_size * data_parallel_size > 1. \
+                  please reduce your MBS and data parallelism to 1 if you want to use drop_last=False, or switch to drop_last=True to avoid this error"
+            )
         self.last_batch_size = self.total_samples % self.micro_batch_times_data_parallel_size
+        self.seed = seed
+
+    def __len__(self):
+        active_total_samples = self.total_samples - (self.last_batch_size if self.drop_last else 0)
+        num_available_samples = active_total_samples - self.consumed_samples % active_total_samples
+        if self.global_batch_size is not None:
+            if self.drop_last:
+                num_global_batches = num_available_samples // self.global_batch_size
+            else:
+                num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size
+            # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and
+            # num of batches fetched (as training step fetches in terms of micro batches)
+            return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size)
+        else:
+            if self.drop_last:
+                return num_available_samples // self.micro_batch_times_data_parallel_size
+            else:
+                return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size
 
     def __iter__(self):
         active_total_samples = self.total_samples - self.last_batch_size
@@ -404,7 +428,7 @@ def __iter__(self):
         start_idx = self.data_parallel_rank * bucket_size
 
         g = torch.Generator()
-        g.manual_seed(self.epoch)
+        g.manual_seed(self.seed + self.epoch)
         random_idx = torch.randperm(bucket_size, generator=g).tolist()
         idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
 
diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py
index 2ccb9bb1b1fe5..e699f15565bde 100644
--- a/nemo/lightning/io/connector.py
+++ b/nemo/lightning/io/connector.py
@@ -257,10 +257,12 @@ def local_path(self, base_path: Optional[Path] = None) -> Path:
 
             _base = Path(NEMO_MODELS_CACHE)
 
-        # If the useu supplied `hf:///path/to/downloaded/my-model/`
+        # If the user supplied `hf:///path/to/downloaded/my-model/`
         # then extract the last dir-name (i.e. my-model) and append it to _base
         if str(self).startswith('/'):
-            return _base / PurePath((str(self))).name
+            if self.suffix in ['.pt', '.pth']:
+                return _base / self.parent.name
+            return _base / self.name
         return _base / str(self).replace("://", "/")
 
     def on_import_ckpt(self, model: pl.LightningModule):
diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index 2a0e346ced2a5..6a3138b1da294 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -47,6 +47,7 @@
 from megatron.core.distributed import DistributedDataParallelConfig
 from megatron.core.optimizer import OptimizerConfig
 from megatron.core.transformer.transformer_config import TransformerConfig
+from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities import move_data_to_device
 from torch import Tensor, nn
 from typing_extensions import override
@@ -1040,6 +1041,7 @@ class MegatronStep(Generic[ModelT, DataT]):
         micro_batch_size (Optional[int]): Size of each micro-batch.
         seq_length (Optional[int]): Sequence length for the current step.
         num_microbatches (Optional[int]): Number of micro-batches in this step.
+        decoder_seq_length (Optional[int]): Sequence length of decoder (used only in encoder-decoder style models) for the current step.
 
     Type Parameters:
         ModelT: The type of the model being used.
@@ -1054,6 +1056,7 @@ class MegatronStep(Generic[ModelT, DataT]):
     seq_length: Optional[int] = None
     num_microbatches: Optional[int] = None
     step_i: Optional[int] = None
+    decoder_seq_length: Optional[int] = None
 
     @classmethod
     def infer(
@@ -1131,6 +1134,7 @@ def __call__(self) -> List[Any]:
             seq_length=self.seq_length,
             micro_batch_size=self.micro_batch_size,
             forward_only=self.forward_only,
+            decoder_seq_length=self.decoder_seq_length,
         )
 
     def to_data_iterator_list(
diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py
index f37fd38adf531..024e2577c8682 100644
--- a/nemo/lightning/pytorch/plugins/data_sampler.py
+++ b/nemo/lightning/pytorch/plugins/data_sampler.py
@@ -44,8 +44,10 @@ def __init__(
         init_consumed_samples: int = 0,
         init_global_step: int = 0,
         output_log: bool = True,
+        decoder_seq_len: Optional[int] = None,
     ):
         self.seq_len = seq_len
+        self.decoder_seq_len = decoder_seq_len
         self.output_log = output_log
         self.micro_batch_size = micro_batch_size
         self.global_batch_size = global_batch_size
@@ -110,6 +112,7 @@ def on_megatron_step_start(self, step: MegatronStep) -> MegatronStep:
             seq_length=self.seq_len,
             micro_batch_size=self.micro_batch_size,
             num_microbatches=self.num_microbatches,
+            decoder_seq_length=self.decoder_seq_len,
         )
 
     def on_megatron_microbatches_start(self, step: MegatronStep) -> None:
diff --git a/tests/collections/multimodal/data/energon/test_data_module.py b/tests/collections/multimodal/data/energon/test_data_module.py
index 23edc0dd3094c..179d3f09f2df7 100644
--- a/tests/collections/multimodal/data/energon/test_data_module.py
+++ b/tests/collections/multimodal/data/energon/test_data_module.py
@@ -93,14 +93,14 @@ def test_data_module(self):
             self.assertIn('attention_mask', batch)
             print(batch)
             decoded_text = self.decode_vqa_tokens_to_text(batch['tokens'][0].tolist())
-            system_message = re.escape(self.data_module.multimodal_sample_config.conversation_template_config.system)
+            # system_message = re.escape(self.data_module.multimodal_sample_config.conversation_template_config.system)
             user_context = re.escape(self.vqa_json[0]['value'])
             assistant_answer = re.escape(self.vqa_json[1]['value'])
-            self.assertRegex(
-                decoded_text,
-                rf"{system_message}",
-                msg="System message block does not match the expected format.",
-            )
+            # self.assertRegex(
+            #     decoded_text,
+            #     rf"{system_message}",
+            #     msg="System message block does not match the expected format.",
+            # )
             self.assertRegex(decoded_text, user_context, msg="User context did not match in decoded text")
             self.assertRegex(
                 decoded_text, assistant_answer, msg="Assistant answer block did not match in decoded text"
@@ -117,14 +117,14 @@ def test_data_module(self):
             self.assertIn('attention_mask', batch)
             print(batch)
             decoded_text = self.decode_vqa_tokens_to_text(batch['tokens'][0].tolist())
-            system_message = re.escape(self.data_module.multimodal_sample_config.conversation_template_config.system)
+            # system_message = re.escape(self.data_module.multimodal_sample_config.conversation_template_config.system)
             user_context = re.escape(self.vqa_json[0]['value'])
             assistant_answer = re.escape(self.vqa_json[1]['value'])
-            self.assertRegex(
-                decoded_text,
-                rf"{system_message}",
-                msg="System message block does not match the expected format.",
-            )
+            # self.assertRegex(
+            #     decoded_text,
+            #     rf"{system_message}",
+            #     msg="System message block does not match the expected format.",
+            # )
             self.assertRegex(decoded_text, user_context, msg="User context did not match in decoded text")
             self.assertRegex(
                 decoded_text, assistant_answer, msg="Assistant answer block did not match in decoded text"