
Commit 5dd8afa

yaoyu-33, suiyoubi, cuichenx, meatybobby, and yashaswikarnati committed
Add LLama32 Vision Model Support in Nemo 2.0 (NVIDIA#10763)
* add initial code for llama vlm Signed-off-by: yaoyu-33 <[email protected]>
* some restructure Signed-off-by: yaoyu-33 <[email protected]>
* add mock data placeholder Signed-off-by: yaoyu-33 <[email protected]>
* Fix some importing Signed-off-by: yaoyu-33 <[email protected]>
* add language component for vlm llama
* update code Signed-off-by: yaoyu-33 <[email protected]>
* now match num of params
* update language part and fix vision part Signed-off-by: yaoyu-33 <[email protected]>
* minor fix Signed-off-by: yaoyu-33 <[email protected]>
* model can now init Signed-off-by: yaoyu-33 <[email protected]>
* minor update for llama32 text config Signed-off-by: yaoyu-33 <[email protected]>
* make checkpoint loading work
* missing import
* match vision part tensor shapes with configs Signed-off-by: yaoyu-33 <[email protected]>
* solve some fwd issues and mismatch issues Signed-off-by: yaoyu-33 <[email protected]>
* add vision import
* fixes Signed-off-by: yaoyu-33 <[email protected]>
* update importer to convert both text and image weights
* importer typos and reduce clutter
* fix import qkv
* some fixes for LLM Signed-off-by: yaoyu-33 <[email protected]>
* Add embedding
* some updates Signed-off-by: yaoyu-33 <[email protected]>
* enable loading only text or only vision
* add example script
* TP fix Signed-off-by: yaoyu-33 <[email protected]>
* update
* upload examples Signed-off-by: yaoyu-33 <[email protected]>
* update generate Signed-off-by: yaoyu-33 <[email protected]>
* update to newer version Signed-off-by: yaoyu-33 <[email protected]>
* upload for sharing
* update to new pyt ckpt
* xattn_caches matches (except small differences due to TE RMSNorm)
* cleanup
* embeddings match
* match precision of weights
* update sharded state dict Signed-off-by: yaoyu-33 <[email protected]>
* change xattn layer num to 3 7 11 etc
* upload llama generation
* minor fix
* fix dummy layer input format
* fix vision qkv order
* fix shareded state dict Signed-off-by: yaoyu-33 <[email protected]>
* fix vision precision
* fix rope
* match cross attn layer
* remove nrep
* Remove cross attention in ImageTransformerLayer and fix _gate_ffn
* PP draft Signed-off-by: yaoyu-33 <[email protected]>
* Fix intermediate tensor
* temp save for pp2 is working Signed-off-by: yaoyu-33 <[email protected]>
* fix pp issues Signed-off-by: yaoyu-33 <[email protected]>
* merge
* update mcore parallelism initialization Signed-off-by: yaoyu-33 <[email protected]>
* small update to pretrain script Signed-off-by: yaoyu-33 <[email protected]>
* update mcore parallelism initialization Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* added energon dataloader for neva training (NVIDIA#10451)
  * added energon dataloader for neva training
  * Apply isort and black reformatting Signed-off-by: yashaswikarnati <[email protected]>
  * specify global batch size to support grad accumulation
  * adding neva pretrain example
  * Apply isort and black reformatting Signed-off-by: yashaswikarnati <[email protected]>
  * change pretraine example to handle new ckpt reloading
  * fixed code quality warnings and unused imports Signed-off-by: ykarnati <[email protected]>
  * minor changes for PR comments
  * Apply isort and black reformatting Signed-off-by: yashaswikarnati <[email protected]>
  * refactor conversation template config
  * Apply isort and black reformatting Signed-off-by: yashaswikarnati <[email protected]>
  * remove optional import
  ---------
  Signed-off-by: yashaswikarnati <[email protected]>
  Signed-off-by: ykarnati <[email protected]>
  Co-authored-by: yashaswikarnati <[email protected]>
  (cherry picked from commit 7354740)
* llama energon dataloader
* have tokenizer for base task encoder class
* Update megatron_init.py Signed-off-by: Yu Yao <[email protected]>
* Add simple inference
* evian3 update Signed-off-by: yaoyu-33 <[email protected]>
* add encoder parallel default config Signed-off-by: yaoyu-33 <[email protected]>
* add encoder parallel default config Signed-off-by: yaoyu-33 <[email protected]>
* clean up Signed-off-by: yaoyu-33 <[email protected]>
* add aspect ratio in model
* support energon dataloader
* some pp update Signed-off-by: yaoyu-33 <[email protected]>
* fixes Signed-off-by: yaoyu-33 <[email protected]>
* fix kv merging Signed-off-by: yaoyu-33 <[email protected]>
* fix get_key_value_tensors Signed-off-by: yaoyu-33 <[email protected]>
* rename files Signed-off-by: yaoyu-33 <[email protected]>
* update to HF style position embedding Signed-off-by: yaoyu-33 <[email protected]>
* fix energon dataloader and support batching
* update forward args Signed-off-by: yaoyu-33 <[email protected]>
* clean up and move to aspect_ratio_ids Signed-off-by: yaoyu-33 <[email protected]>
* rename back to language.py Signed-off-by: yaoyu-33 <[email protected]>
* fix loss function Signed-off-by: yaoyu-33 <[email protected]>
* update and fix energon Signed-off-by: yaoyu-33 <[email protected]>
* Add hf import
* Fix type
* Change config
* update energon pretrain Signed-off-by: yaoyu-33 <[email protected]>
* clean up
* clean up
* reformat Signed-off-by: yaoyu-33 <[email protected]>
* update inference files for new code
* update to instruct
* update to instruct
* update few names Signed-off-by: yaoyu-33 <[email protected]>
* update generation Signed-off-by: yaoyu-33 <[email protected]>
* fix importer embedding.weight
* few fixes Signed-off-by: yaoyu-33 <[email protected]>
* add hf script Signed-off-by: yaoyu-33 <[email protected]>
* fix kv import
* remove interleaved
* fixes and updates Signed-off-by: yaoyu-33 <[email protected]>
* lora fixes Signed-off-by: yaoyu-33 <[email protected]>
* some code clean ups Signed-off-by: yaoyu-33 <[email protected]>
* update training scripts Signed-off-by: yaoyu-33 <[email protected]>
* refactors Signed-off-by: yaoyu-33 <[email protected]>
* add LoRA finetuning
* fixes and nemo update Signed-off-by: yaoyu-33 <[email protected]>
* fix importer registering issue by adding 11B and 90B configs
* update `decoder_seq_len` Signed-off-by: yaoyu-33 <[email protected]>
* science vqa script Signed-off-by: yaoyu-33 <[email protected]>
* clean up script name Signed-off-by: yaoyu-33 <[email protected]>
* fix ckpt save serialization issue
* fix predefined config classes
* add num_chunks in input Signed-off-by: yaoyu-33 <[email protected]>
* fix format Signed-off-by: yaoyu-33 <[email protected]>
* update finetuning scripts for PEFT
* add 11b recipe (need NVIDIA#10645 to test)
* fix mask generation Signed-off-by: yaoyu-33 <[email protected]>
* minor fix code style Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Support no image inference
* add llama svqa eval
* fix masking Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* fix generation Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* add 90b recipe and revise 11b recipe
* Apply isort and black reformatting Signed-off-by: cuichenx <[email protected]>
* clean up typing
* add option to disable vision padding
* Apply isort and black reformatting Signed-off-by: cuichenx <[email protected]>
* base model finetuning (does not work yet)
* Apply isort and black reformatting Signed-off-by: cuichenx <[email protected]>
* fixed default conversation template config for MLLama
* Update svqa
* add multinode
* bot happy
* Apply isort and black reformatting Signed-off-by: cuichenx <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: artbataev <[email protected]>
* Perf improvements. Mainly from XAttn mask calculation (NVIDIA#10901)
  * Perf improvements. Mainly from XAttn mask calculation
  * Apply isort and black reformatting Signed-off-by: parthmannan <[email protected]>
  ---------
  Signed-off-by: parthmannan <[email protected]>
  Co-authored-by: parthmannan <[email protected]>
* fix existing issues Signed-off-by: yaoyu-33 <[email protected]>
* fix scripts Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* fix lora
* few fixes for non image support Signed-off-by: yaoyu-33 <[email protected]>
* update masking gen Signed-off-by: yaoyu-33 <[email protected]>
* update lazy dataset Signed-off-by: yaoyu-33 <[email protected]>
* fix data sampler and loading issue Signed-off-by: yaoyu-33 <[email protected]>
* Add vlm generation
* Apply isort and black reformatting Signed-off-by: meatybobby <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* generation update Signed-off-by: yaoyu-33 <[email protected]>
* update lazy dataset Signed-off-by: yaoyu-33 <[email protected]>
* Fix _strategy_lib.py Signed-off-by: Yu Yao <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* fix warning Signed-off-by: yaoyu-33 <[email protected]>
* hide vlm examples Signed-off-by: yaoyu-33 <[email protected]>
* Revert "Add vlm generation" This reverts commit 4711c75 Signed-off-by: yaoyu-33 <[email protected]>
* Fix VisionEncoder multi-batch bug
* update mcore parallelism initialization Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Update megatron_init.py Signed-off-by: Yu Yao <[email protected]>
* add encoder parallel default config Signed-off-by: yaoyu-33 <[email protected]>
* Fix _strategy_lib.py Signed-off-by: Yu Yao <[email protected]>
* llm.generate fixes (NVIDIA#10983)
  * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi <[email protected]>
  * format Signed-off-by: HuiyingLi <[email protected]>
  * address comments, require user to provide trainer Signed-off-by: HuiyingLi <[email protected]>
  * minor fix Signed-off-by: HuiyingLi <[email protected]>
  * minor fixes Signed-off-by: HuiyingLi <[email protected]>
  ---------
  Signed-off-by: HuiyingLi <[email protected]>
* use __dict__ in check (NVIDIA#11012)
  * check is_hf_model in leaf module Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * Apply isort and black reformatting Signed-off-by: akoumpa <[email protected]>
  * disable getattr alternative path Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * fix Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * undo; Signed-off-by: Alexandros Koumparoulis <[email protected]>
  ---------
  Signed-off-by: Alexandros Koumparoulis <[email protected]>
  Signed-off-by: akoumpa <[email protected]>
  Co-authored-by: akoumpa <[email protected]>
* LoRA support for HF::AutoModelForCausalLM (NVIDIA#10982)
  * add LinearAdapter Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * add hf lora example Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * remove unused imports Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * fix Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * fix Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * subclass mixin Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * remove stale imports Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * undo Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * fix scale Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * regex selector for peft Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * move lora Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * fmt Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * hf_auto_model_for_causal_lm finetune recipe Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * Apply isort and black reformatting Signed-off-by: akoumpa <[email protected]>
  ---------
  Signed-off-by: Alexandros Koumparoulis <[email protected]>
  Signed-off-by: akoumpa <[email protected]>
  Co-authored-by: akoumpa <[email protected]>
* Change default for always_save_context to True (NVIDIA#11014) Signed-off-by: Abhishree <[email protected]> Co-authored-by: Pablo Garay <[email protected]>
* Add a build option to load_context (NVIDIA#10713)
  * Add a build option to load_context Signed-off-by: Marc Romeijn <[email protected]> Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * Adding test Signed-off-by: Marc Romeijn <[email protected]> Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * Trying to fix failing CPU test Signed-off-by: Marc Romeijn <[email protected]> Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * cherry-pick fix Signed-off-by: Alexandros Koumparoulis <[email protected]>
  ---------
  Signed-off-by: Marc Romeijn <[email protected]>
  Signed-off-by: Alexandros Koumparoulis <[email protected]>
  Co-authored-by: Alexandros Koumparoulis <[email protected]>
* Fix pip install (NVIDIA#11026)
  * Move AutoTokenizer inline Signed-off-by: Marc Romeyn <[email protected]>
  * Move einops to common requirements Signed-off-by: Marc Romeyn <[email protected]>
  * Move AutoTokenizer import to top-level again in fine_tuning Signed-off-by: Marc Romeyn <[email protected]>
  * Move megatron init inside nemo.lightning Signed-off-by: Marc Romeyn <[email protected]>
  * Make megatron_lazy_init_context work when transformer-engine is not installed Signed-off-by: Marc Romeyn <[email protected]>
  * Only import get_nmt_tokenizer when needed Signed-off-by: Marc Romeyn <[email protected]>
  * Apply isort and black reformatting Signed-off-by: marcromeyn <[email protected]>
  ---------
  Signed-off-by: Marc Romeyn <[email protected]>
  Signed-off-by: marcromeyn <[email protected]>
  Co-authored-by: marcromeyn <[email protected]>
* [WIP] Add docs for NEST SSL (NVIDIA#10804)
  * add docs Signed-off-by: stevehuang52 <[email protected]>
  * update doc and fix missing param Signed-off-by: stevehuang52 <[email protected]>
  ---------
  Signed-off-by: stevehuang52 <[email protected]>
* Change dist ckpt defaults (NVIDIA#10913)
  * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min Signed-off-by: Shriya Palsamudram <[email protected]>
  * fix ssm tests Signed-off-by: Shriya Palsamudram <[email protected]>
  * Make note that ckpt_async_save is disabled for SSMs Signed-off-by: Shriya Palsamudram <[email protected]>
  * Enable async ckpt for SSMs with fix Signed-off-by: Shriya Palsamudram <[email protected]>
  * Disable async ckpt in the peft test as it is a known bug, add note. Signed-off-by: Shriya Palsamudram <[email protected]>
  * Fix failing unit tests Signed-off-by: Shriya Palsamudram <[email protected]>
  * Ashors/peft async ckpt (NVIDIA#11010)
    * [WIP] prototype for supporting async checkpointing with peft Signed-off-by: ashors1 <[email protected]> Signed-off-by: Shriya Palsamudram <[email protected]>
  * Enable async ckpt for the peft test Signed-off-by: Shriya Palsamudram <[email protected]>
  * Fix peft setup test Signed-off-by: Shriya Palsamudram <[email protected]>
  ---------
  Signed-off-by: Shriya Palsamudram <[email protected]>
  Signed-off-by: ashors1 <[email protected]>
  Co-authored-by: ataghibakhsh <[email protected]>
* Akoumparouli/mixtral recipe fix r2.0.0 (NVIDIA#10994)
  * Mixtral TP8 EP1 Signed-off-by: Alexandros Koumparoulis <[email protected]>
  * Apply isort and black reformatting Signed-off-by: akoumpa <[email protected]>
  ---------
  Signed-off-by: Alexandros Koumparoulis <[email protected]>
  Signed-off-by: akoumpa <[email protected]>
  Co-authored-by: akoumpa <[email protected]>
* Fix _strategy_lib tests (NVIDIA#11033)
  * fix world size and don't mock Signed-off-by: Maanu Grover <[email protected]>
  * cleanup global state Signed-off-by: Maanu Grover <[email protected]>
  * check app state instead Signed-off-by: Maanu Grover <[email protected]>
  * fix syntax nemo logger test Signed-off-by: Maanu Grover <[email protected]>
  ---------
  Signed-off-by: Maanu Grover <[email protected]>
* Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (NVIDIA#11016)
  * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (NVIDIA#10383)" This reverts commit b5798de.
  * make megatron sampler return the total number of batches in the dataset Signed-off-by: ashors1 <[email protected]>
  ---------
  Signed-off-by: ashors1 <[email protected]>
* PTQ example for NeMo 2.0 (NVIDIA#10642)
  * initial commit Signed-off-by: Piotr Kaminski <[email protected]>
  * create Quantizer for NeMo 2.0 Signed-off-by: Piotr Kaminski <[email protected]>
  * refactor Signed-off-by: Piotr Kaminski <[email protected]>
  * Call quantize on an unwrapped mcore model Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * Add tests, adjust unwrapping Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * fix export Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * Apply isort and black reformatting Signed-off-by: artbataev <[email protected]>
  * Fix output_path argument for HF import Signed-off-by: Piotr Kamiński <[email protected]>
  * fix fabric ckpt loading Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * code review suggestions Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * remove unused import Signed-off-by: Piotr Kaminski <[email protected]>
  * use cnn dataset in github ci Signed-off-by: Piotr Kaminski <[email protected]>
  * applied code review Signed-off-by: Piotr Kaminski <[email protected]>
  * code review changes Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * simplify interface for data iterator Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  * (partial) PP fix Signed-off-by: Piotr Kaminski <[email protected]>
  * Apply isort and black reformatting Signed-off-by: Laplasjan107 <[email protected]>
  ---------
  Signed-off-by: Piotr Kaminski <[email protected]>
  Signed-off-by: Laplasjan107 <[email protected]>
  Signed-off-by: Piotr Kamiński <[email protected]>
  Signed-off-by: artbataev <[email protected]>
  Co-authored-by: Piotr Kaminski <[email protected]>
  Co-authored-by: Laplasjan107 <[email protected]>
  Co-authored-by: artbataev <[email protected]>
* TDT compute timestamps option and Extra Whitespace handling for SPE (NVIDIA#10875)
  * add token duration Signed-off-by: monica-sekoyan <[email protected]>
  * revert rnnt change Signed-off-by: monica-sekoyan <[email protected]>
  * add remove_extra_whitespaces arg to spe tokenizer Signed-off-by: monica-sekoyan <[email protected]>
  * add token duration retrieval Signed-off-by: monica-sekoyan <[email protected]>
  * add ignore_extra_whitespace to spe Signed-off-by: monica-sekoyan <[email protected]>
  * add compute_timestamp support for tdt Signed-off-by: monica-sekoyan <[email protected]>
  * fix config field name Signed-off-by: monica-sekoyan <[email protected]>
  * add refinement for tdt timestamps Signed-off-by: monica-sekoyan <[email protected]>
  * add segments timestamp support and refinement for ctc Signed-off-by: monica-sekoyan <[email protected]>
  * modify tests for ctc decoding timestamps Signed-off-by: monica-sekoyan <[email protected]>
  * add rnnt timestamp tests Signed-off-by: monica-sekoyan <[email protected]>
  * updated doc Signed-off-by: monica-sekoyan <[email protected]>
  * fix in test Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  * fix of unicode char Signed-off-by: monica-sekoyan <[email protected]>
  * fix rnnt_decoding test Signed-off-by: monica-sekoyan <[email protected]>
  * workaround for tesst tokenizer Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  * modify segments formation Signed-off-by: monica-sekoyan <[email protected]>
  * modify segments for ctc Signed-off-by: monica-sekoyan <[email protected]>
  * fix in ctc refinement Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  * minor changes Signed-off-by: monica-sekoyan <[email protected]>
  * reverse offset change Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  * warning mode=once Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  * make ignore_extrawhitespaces false Signed-off-by: monica-sekoyan <[email protected]>
  * minor changes Signed-off-by: monica-sekoyan <[email protected]>
  * adjust changes to the tests Signed-off-by: monica-sekoyan <[email protected]>
  * modify prompt_formatter tests Signed-off-by: monica-sekoyan <[email protected]>
  * Apply isort and black reformatting Signed-off-by: monica-sekoyan <[email protected]>
  ---------
  Signed-off-by: monica-sekoyan <[email protected]>
  Signed-off-by: monica-sekoyan <[email protected]>
  Co-authored-by: monica-sekoyan <[email protected]>
* Basic online dynamic FP8 quantization with vLLM (NVIDIA#10904)
  * Basic online dynamic quantization with vLLM Signed-off-by: Jan Lasek <[email protected]>
  * Apply isort and black reformatting Signed-off-by: janekl <[email protected]>
  * vllm 0.6.3 updates Signed-off-by: Jan Lasek <[email protected]>
  * Pass quantization param in deploy_vllm_triton.py script Signed-off-by: Jan Lasek <[email protected]>
  ---------
  Signed-off-by: Jan Lasek <[email protected]>
  Signed-off-by: janekl <[email protected]>
  Co-authored-by: janekl <[email protected]>
* ci: Improve VM maintenance (NVIDIA#10758)
  * ci: Improve VM maintenance Signed-off-by: Oliver Koenig <[email protected]>
  * rename stuff Signed-off-by: Oliver Koenig <[email protected]>
  * title Signed-off-by: Oliver Koenig <[email protected]>
  * use team Signed-off-by: Oliver Koenig <[email protected]>
  * run on failure too Signed-off-by: Oliver Koenig <[email protected]>
  * fix Signed-off-by: Oliver Koenig <[email protected]>
  * yrdy Signed-off-by: Oliver Koenig <[email protected]>
  * f Signed-off-by: Oliver Koenig <[email protected]>
  * test Signed-off-by: Oliver Koenig <[email protected]>
  * fix Signed-off-by: Oliver Koenig <[email protected]>
  * f Signed-off-by: Oliver Koenig <[email protected]>
  * f Signed-off-by: Oliver Koenig <[email protected]>
  * f Signed-off-by: Oliver Koenig <[email protected]>
  ---------
  Signed-off-by: Oliver Koenig <[email protected]>
* Add comment for vision transpose
* update megatron_init.py inside lightning Signed-off-by: yaoyu-33 <[email protected]>
* rename llama to mllama folder name Signed-off-by: yaoyu-33 <[email protected]>
* update to attention bias Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* update dropout to 0 Signed-off-by: yaoyu-33 <[email protected]>
* fix attention bias Signed-off-by: yaoyu-33 <[email protected]>
* remove disable_vision_padding since we now have a fix Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Update init for mllama Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* Address comments Signed-off-by: yaoyu-33 <[email protected]>
* Apply isort and black reformatting Signed-off-by: yaoyu-33 <[email protected]>
* fix copyright title Signed-off-by: yaoyu-33 <[email protected]>
* fix code scan Signed-off-by: yaoyu-33 <[email protected]>
* update vision code Signed-off-by: yaoyu-33 <[email protected]>
* revert attention bias changes until latest MLM code got merged Signed-off-by: yaoyu-33 <[email protected]>
* fix warning Signed-off-by: yaoyu-33 <[email protected]>
* Turn off system message check, as it's "" now Signed-off-by: yaoyu-33 <[email protected]>
* Rolllback megatron_parallel.py Signed-off-by: Yu Yao <[email protected]>
---------
Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: Yu Yao <[email protected]>
Signed-off-by: cuichenx <[email protected]>
Signed-off-by: Chen Cui <[email protected]>
Signed-off-by: artbataev <[email protected]>
Signed-off-by: parthmannan <[email protected]>
Signed-off-by: meatybobby <[email protected]>
Signed-off-by: HuiyingLi <[email protected]>
Signed-off-by: Alexandros Koumparoulis <[email protected]>
Signed-off-by: akoumpa <[email protected]>
Signed-off-by: Abhishree <[email protected]>
Signed-off-by: Marc Romeijn <[email protected]>
Signed-off-by: Marc Romeyn <[email protected]>
Signed-off-by: marcromeyn <[email protected]>
Signed-off-by: stevehuang52 <[email protected]>
Signed-off-by: Shriya Palsamudram <[email protected]>
Signed-off-by: ashors1 <[email protected]>
Signed-off-by: Maanu Grover <[email protected]>
Signed-off-by: Piotr Kaminski <[email protected]>
Signed-off-by: Laplasjan107 <[email protected]>
Signed-off-by: Piotr Kamiński <[email protected]>
Signed-off-by: monica-sekoyan <[email protected]>
Signed-off-by: monica-sekoyan <[email protected]>
Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Signed-off-by: Oliver Koenig <[email protected]>
Co-authored-by: Ao Tang <[email protected]>
Co-authored-by: Chen Cui <[email protected]>
Co-authored-by: Bobby Chen <[email protected]>
Co-authored-by: yaoyu-33 <[email protected]>
Co-authored-by: Yashaswi Karnati <[email protected]>
Co-authored-by: ykarnati <[email protected]>
Co-authored-by: cuichenx <[email protected]>
Co-authored-by: Yashaswi Karnati <[email protected]>
Co-authored-by: artbataev <[email protected]>
Co-authored-by: Parth Mannan <[email protected]>
Co-authored-by: parthmannan <[email protected]>
Co-authored-by: meatybobby <[email protected]>
Co-authored-by: Huiying <[email protected]>
Co-authored-by: Alexandros Koumparoulis <[email protected]>
Co-authored-by: akoumpa <[email protected]>
Co-authored-by: Abhishree Thittenamane <[email protected]>
Co-authored-by: Pablo Garay <[email protected]>
Co-authored-by: Marc Romeyn <[email protected]>
Co-authored-by: Alexandros Koumparoulis <[email protected]>
Co-authored-by: marcromeyn <[email protected]>
Co-authored-by: He Huang (Steve) <[email protected]>
Co-authored-by: Shriya Rishab <[email protected]>
Co-authored-by: ataghibakhsh <[email protected]>
Co-authored-by: Maanu Grover <[email protected]>
Co-authored-by: Anna Shors <[email protected]>
Co-authored-by: Piotr Kamiński <[email protected]>
Co-authored-by: Piotr Kaminski <[email protected]>
Co-authored-by: Laplasjan107 <[email protected]>
Co-authored-by: monica-sekoyan <[email protected]>
Co-authored-by: monica-sekoyan <[email protected]>
Co-authored-by: Jan Lasek <[email protected]>
Co-authored-by: janekl <[email protected]>
Co-authored-by: oliver könig <[email protected]>
1 parent 46db571 · commit 5dd8afa

31 files changed: +3998 −57 lines

nemo/collections/multimodal/data/energon/base.py

+24 −6

@@ -11,23 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import TYPE_CHECKING, Any, Dict, Literal, Optional
 
+from copy import deepcopy
+from typing import Any, Dict, Literal, Optional
+
+import fiddle as fdl
 import pytorch_lightning as pl
 from megatron.core import parallel_state
 from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils.data import DataLoader
+from typing_extensions import Self
 
 from nemo.collections.multimodal.data.energon.config import MultiModalSampleConfig
 from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder
-from nemo.lightning.io.mixin import IOMixin
+from nemo.lightning.io.mixin import IOMixin, serialization, track_io
 from nemo.lightning.pytorch.plugins import MegatronDataSampler
 from nemo.utils import logging
 
-if TYPE_CHECKING:
-    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
-
 
 class SimpleMultiModalDataModule(pl.LightningDataModule, IOMixin):
     """
@@ -66,6 +67,7 @@ def __init__(
         pin_memory: bool = True,
         multimodal_sample_config: Optional[MultiModalSampleConfig] = MultiModalSampleConfig(),
         task_encoder: Optional[MultiModalTaskEncoder] = None,
+        decoder_seq_length: Optional[int] = None,
     ) -> None:
         """
         Initialize the SimpleMultiModalDataModule.
@@ -87,6 +89,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.image_processor = image_processor
         self.seq_length = seq_length
+        self.decoder_seq_length = decoder_seq_length
         self.micro_batch_size = micro_batch_size
         self.global_batch_size = global_batch_size
         self.num_workers = num_workers
@@ -99,11 +102,24 @@ def __init__(
         )
         self.init_global_step = 0
         self.data_sampler = SequentialMegatronSampler(
-            seq_len=self.seq_length, micro_batch_size=self.micro_batch_size, global_batch_size=self.global_batch_size
+            seq_len=self.seq_length,
+            decoder_seq_len=self.decoder_seq_length,
+            micro_batch_size=self.micro_batch_size,
+            global_batch_size=self.global_batch_size,
         )
         self.train_dataloader_object = None
         self.val_dataloader_object = None
 
+    def io_init(self, **kwargs) -> fdl.Config[Self]:
+        # (pleasefixme) image_processor and task_encoder are problematic with Fiddle so we skip serializing them for now
+        cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items() if k not in ['image_processor', 'task_encoder']}
+
+        for val in cfg_kwargs.values():
+            if not serialization.find_node_traverser(type(val)):
+                track_io(type(val))
+        cfg = fdl.Config(type(self), **cfg_kwargs)
+        return cfg
+
     def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val'):
         """
         Provide the dataset for training or validation.
@@ -315,6 +331,7 @@ def __init__(
         micro_batch_size: int = 4,
         global_batch_size: int = 8,
         init_consumed_samples: int = 0,
+        decoder_seq_len: Optional[int] = None,
         init_global_step=0,
     ):
         """
@@ -328,6 +345,7 @@
         """
        super().__init__(
            seq_len=seq_len,
+            decoder_seq_len=decoder_seq_len,
            micro_batch_size=micro_batch_size,
            global_batch_size=global_batch_size,
            init_consumed_samples=init_consumed_samples,

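The io_init override above exists so the data module can still be serialized with Fiddle even though some constructor arguments (the image processor and task encoder) are not Fiddle-friendly. Below is a minimal, self-contained sketch of the same pattern; Rebuildable and its fields are illustrative stand-ins rather than NeMo classes, and only fdl.Config/fdl.build are assumed from the fiddle library.

# Sketch of the io_init pattern: drop hard-to-serialize kwargs, then capture
# the remaining constructor call as a fdl.Config that can rebuild the object.
from copy import deepcopy

import fiddle as fdl


class Rebuildable:
    def __init__(self, seq_length: int, tokenizer=None):
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        # Capture the constructor call, skipping the unserializable 'tokenizer'.
        self.__io__ = self.io_init(seq_length=seq_length, tokenizer=tokenizer)

    def io_init(self, **kwargs) -> fdl.Config:
        cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items() if k != 'tokenizer'}
        return fdl.Config(type(self), **cfg_kwargs)


module = Rebuildable(seq_length=2048)
rebuilt = fdl.build(module.__io__)  # re-invokes Rebuildable(seq_length=2048)
assert rebuilt.seq_length == 2048
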
nemo/collections/multimodal/data/energon/config.py

+1 −7

@@ -15,7 +15,7 @@
 from dataclasses import dataclass, field
 from typing import List
 import torch
-from nemo.collections.multimodal.data.energon.conversation import BaseConversationTemplateConfig
+from nemo.collections.multimodal.data.energon.conversation import LLaVATemplateConfig
 
 
 @dataclass
@@ -56,12 +56,6 @@ class ImageTextRawBatch:
     loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float))
 
 
-class LLaVATemplateConfig(BaseConversationTemplateConfig):
-    """LLava specific template configuration which extends the base config"""
-
-    pass
-
-
 @dataclass
 class MultiModalSampleConfig:
     image_token: ImageToken = field(default_factory=ImageToken)

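The removed class now lives in conversation.py (next file), so a single definition is shared instead of being duplicated here. The pattern these configs rely on is plain dataclass inheritance: the base holds defaults and each model-specific template overrides only what differs. A small self-contained sketch with simplified, assumed field names:

# Illustrative only: base template config with defaults, subclass overriding 'system'.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class BaseTemplateConfig:
    system: Optional[str] = None
    roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
    stop_string: Optional[str] = None


@dataclass
class LLaVATemplate(BaseTemplateConfig):
    system: Optional[str] = "A chat between a curious user and an artificial assistant agent."


print(LLaVATemplate().roles)  # ['user', 'assistant'], inherited from the base
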
nemo/collections/multimodal/data/energon/conversation.py

+20

@@ -19,6 +19,15 @@
 class BaseConversationTemplateConfig:
     """Conversation template config related parameters"""
 
+    system: Optional[str] = "".format()  # fmt: off
+    roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
+    stop_string: Optional[str] = None
+    chat_template = None
+
+
+class LLaVATemplateConfig(BaseConversationTemplateConfig):
+    """LLava specific template configuration which extends the base config"""
+
     system: Optional[str] = (
         "A chat between a curious user and artificial assistant agent. The assistant gives helpful, detailed and polite answers to user's questions.".format()
     )  # fmt: off
@@ -36,3 +45,14 @@ class BaseConversationTemplateConfig:
     {%- endif %}
     {%- endfor -%}
     """
+
+
+class MLlamaTemplateConfig(BaseConversationTemplateConfig):
+    """MLlama specific template configuration which extends the base config"""
+
+    system: Optional[str] = None
+    roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
+    stop_string: str = None
+    chat_template = """
+    '{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now("%d %b %Y") %}\n {%- else %}\n {%- set date_string = "26 Jul 2024" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n {%- set system_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = "" %}\n{%- endif %}\n\n{#- Find out if there are any images #}\n{% set image_ns = namespace(has_images=false) %} \n{%- for message in messages %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {%- set image_ns.has_images = true %}\n {%- endif %}\n {%- endfor %}\n{%- endfor %}\n\n{#- Error out if there are images and system message #}\n{%- if image_ns.has_images and not system_message == "" %}\n {{- raise_exception("Prompting with images is incompatible with system messages.") }}\n{%- endif %}\n\n{#- System message if there are no images #}\n{%- if not image_ns.has_images %}\n {{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n {%- if tools is not none %}\n {{- "Environment: ipython\\n" }}\n {%- endif %}\n {{- "Cutting Knowledge Date: December 2023\\n" }}\n {{- "Today Date: " + date_string + "\\n\\n" }}\n {%- if tools is not none and not tools_in_user_message %}\n {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- "<|eot_id|>" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n {{- "Given the following functions, please respond with a JSON for a function call " }}\n {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\' }}\n {%- if message[\'content\'] is string %}\n {{- message[\'content\'] }}\n {%- else %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {{- \'<|image|>\' }}\n {%- elif content[\'type\'] == \'text\' %}\n {{- content[\'text\'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- \'<|eot_id|>\' }}\n {%- elif \'tool_calls\' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception("This model only supports single tool-calls at once!") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n {{- \'{"name": "\' + tool_call.name + \'", \' }}\n {{- \'"parameters": \' }}\n {{- tool_call.arguments | tojson }}\n {{- "}" }}\n {{- "<|eot_id|>" }}\n {%- elif message.role == "tool" or message.role == "ipython" %}\n {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- "<|eot_id|>" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n'
+    """

nemo/collections/multimodal/data/energon/task_encoder.py

+1 −1

@@ -62,7 +62,7 @@ def __init__(self, tokenizer, image_processor, multimodal_sample_config):
             image_processor (ImageProcessor): The image processor used for preprocessing images across different sample types.
             multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples, including tokens and placeholders.
         """
-
+        self.tokenizer = tokenizer
         self.encoders: Dict[str, SampleEncoder] = {
             VQASample.__name__: VQASampleEncoder(
                 tokenizer=tokenizer,

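The constructor above (now also keeping self.tokenizer) supports a simple dispatch scheme: the task encoder holds a dict from sample-class name to a per-type encoder and routes each sample by type(sample).__name__. A stripped-down sketch of that scheme, with toy stand-ins for the NeMo and megatron.energon classes:

# Illustrative dispatch-by-sample-type pattern; VQASample/VQASampleEncoder here
# are simplified stand-ins, not the real NeMo classes.
from dataclasses import dataclass


@dataclass
class VQASample:
    question: str
    answer: str


class VQASampleEncoder:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode(self, sample: VQASample):
        return self.tokenizer(f"{sample.question} {sample.answer}")


class TaskEncoder:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.encoders = {VQASample.__name__: VQASampleEncoder(tokenizer)}

    def encode_sample(self, sample):
        encoder = self.encoders.get(type(sample).__name__)
        if encoder is None:
            raise NotImplementedError(f"no encoder for {type(sample).__name__}")
        return encoder.encode(sample)


enc = TaskEncoder(tokenizer=str.split)  # toy "tokenizer" for demonstration
print(enc.encode_sample(VQASample("What is shown?", "A cat")))
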
nemo/collections/vlm/__init__.py

+45 −7

@@ -1,28 +1,56 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule
+from nemo.collections.vlm.mllama.model.base import (
+    CrossAttentionTextConfig,
+    CrossAttentionVisionConfig,
+    MLlamaModel,
+    MLlamaModelConfig,
+)
+from nemo.collections.vlm.mllama.model.mllama import (
+    MLlamaConfig11B,
+    MLlamaConfig11BInstruct,
+    MLlamaConfig90B,
+    MLlamaConfig90BInstruct,
+)
 from nemo.collections.vlm.neva.data import (
     DataConfig,
     ImageDataConfig,
     ImageToken,
-    MockDataModule,
     MultiModalToken,
     NevaLazyDataModule,
+    NevaMockDataModule,
     VideoDataConfig,
     VideoToken,
 )
-from nemo.collections.vlm.neva.model import (
+from nemo.collections.vlm.neva.model.base import (
     CLIPViTConfig,
     HFCLIPVisionConfig,
-    Llava1_5Config7B,
-    Llava1_5Config13B,
-    LlavaConfig,
-    LlavaModel,
     MultimodalProjectorConfig,
     NevaConfig,
     NevaModel,
 )
+from nemo.collections.vlm.neva.model.llava import Llava1_5Config7B, Llava1_5Config13B, LlavaConfig, LlavaModel
+from nemo.collections.vlm.peft import LoRA
+from nemo.collections.vlm.recipes import *
 
 __all__ = [
-    "MockDataModule",
+    "NevaMockDataModule",
     "NevaLazyDataModule",
+    "MLlamaMockDataModule",
+    "MLlamaLazyDataModule",
     "DataConfig",
     "ImageDataConfig",
     "VideoDataConfig",
@@ -38,4 +66,14 @@
     "Llava1_5Config7B",
     "Llava1_5Config13B",
     "LlavaModel",
+    "MLlamaModel",
+    "MLlamaModelConfig",
+    "CrossAttentionTextConfig",
+    "CrossAttentionVisionConfig",
+    "MLlamaConfig11B",
+    "MLlamaConfig11BInstruct",
+    "MLlamaConfig90B",
+    "MLlamaConfig90BInstruct",
+    "mllama_11b",
+    "mllama_90b",
 ]
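
With these exports in place, the new MLlama classes are importable from the top-level vlm collection. A hedged usage sketch; the exact constructor signatures of MLlamaModel and the predefined configs are not shown in this diff, so the wiring below is an assumption:

# Assumed usage: predefined 11B-Instruct config fed to the model class.
from nemo.collections import vlm

model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct())
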
nemo/collections/vlm/mllama/__init__.py

+17

@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers import PreTrainedTokenizerFast
+from nemo.lightning.io import track_io
+
+track_io(PreTrainedTokenizerFast)
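
Registering a class with track_io, as the new module above does for PreTrainedTokenizerFast, tells NeMo's IO layer how to traverse and serialize instances of that class when they appear in captured constructor arguments. The same one-liner should extend to other externally defined types; CLIPImageProcessor below is an assumed example, not part of this commit:

# Assumed extension of the pattern above to an image-processor class.
from transformers import CLIPImageProcessor

from nemo.lightning.io import track_io

track_io(CLIPImageProcessor)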
nemo/collections/vlm/mllama/data/__init__.py

+21

@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule
+from nemo.collections.vlm.mllama.data.mock import MockDataModule as MLlamaMockDataModule
+
+__all__ = [
+    "MLlamaMockDataModule",
+    "MLlamaLazyDataModule",
+]
