NVIDIA
diff --git a/‎docs/source/multimodal/api.rst
+2-2 b/‎docs/source/multimodal/api.rst
+2-2
diff --git a/‎docs/source/multimodal/mllm/checkpoint.rst
+1-1 b/‎docs/source/multimodal/mllm/checkpoint.rst
+1-1
diff --git a/‎docs/source/multimodal/text2img/insp2p.rst
+1-1 b/‎docs/source/multimodal/text2img/insp2p.rst
+1-1
diff --git a/‎docs/source/multimodal/text2img/sd.rst
+1-1 b/‎docs/source/multimodal/text2img/sd.rst
+1-1
diff --git a/‎examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml
+2-1 b/‎examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml
+2-1
diff --git a/‎examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml
+1-1 b/‎examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml
+1-1
diff --git a/‎examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py
+11-1 b/‎examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py
+11-1
diff --git a/‎examples/multimodal/multimodal_llm/neva/eval/gradio_cli.py
+41 b/‎examples/multimodal/multimodal_llm/neva/eval/gradio_cli.py
+41
diff --git a/‎examples/multimodal/multimodal_llm/neva/eval/gradio_server.py
+108 b/‎examples/multimodal/multimodal_llm/neva/eval/gradio_server.py
+108
@@ -10,7 +10,7 @@ Model Classes
     :members: __init__, configure_optimizers
 
 
-.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
@@ -49,7 +49,7 @@ Modules
     :show-inheritance:
     :no-members:
 
-.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
     :show-inheritance:
     :no-members:
     :members: __init__, encode, decode
 
@@ -108,7 +108,7 @@ Adjust model parallelism with:
     --target_tensor_model_parallel_size=??? \
     --pipeline_model_parallel_size=??? \
     --target_pipeline_model_parallel_size=??? \
-    --model_class="nemo.collections.multimodal.models.neva.neva_model.MegatronNevaModel" \
+    --model_class="nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel" \
     --precision=32 \
     --tokenizer_model_path=/path/to/tokenizer.model \
     --tp_conversion_only
@@ -6,7 +6,7 @@ Model Introduction
 
 InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.
 
-Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance.
+Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2Pix's :class:`nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit` is derived directly from Stable Diffusion's :class:`nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion`, with alterations to accommodate the dataset and provide support for dual guidance.
 
 Training Dataset
 --------------------
 
@@ -33,7 +33,7 @@ The VAE configuration is defined under **first_stage_config**.
 .. code-block:: yaml
 
     first_stage_config:
-        _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+        _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
         from_pretrained: /path/to/vae.bin
         embed_dim: 4
         monitor: val/rec_loss
 
@@ -11,6 +11,7 @@ inference:
   compute_logprob: False  # a flag used to compute logprob of all the input text, a very special case of running inference, default False
   end_strings: ["<extra_id_1>","<extra_id_7>",]  # generation will stop when one of these tokens is generated
   images_base_path: /pwd/images
+  insert_image_token: null # `left` or `right` or `null`
 
 trainer:
   devices: 8
@@ -24,7 +25,7 @@ tensor_model_parallel_size: 8
 pipeline_model_parallel_size: 1
 pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others)
 neva_model_file: /pwd/nemo_experiments/nemo_llava.nemo #neva_22b_tp8_finetuned_v1.nemo neva_8b_tp4_finetuned_v1.nemo
-llm_model_file: null
+base_model_file: null
 checkpoint_dir: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/checkpoints # checkpoint file dir. This is used to load the PTL checkpoint generated during the Kosmos training
 checkpoint_name: null #megatron_clip--val_loss=0.41-step=13499-consumed_samples=431904.0.ckpt # PTL checkpoint file name, only used for PTL checkpoint loading
 hparams_file: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/version_0/hparams.yaml # model configuration file, only used for PTL checkpoint loading
 
@@ -209,7 +209,7 @@ model:
 
   optim:
     name: fused_adam
-    lr: 2e-5
+    lr: 2e-4
     weight_decay: 0.
     betas:
       - 0.9
 
@@ -18,7 +18,8 @@
     python convert_hf_llava_to_neva.py \
      --in-file <path_to_hf_checkpoints_folder> \
      --out-file <path_to_output_nemo_file> \
-     --tokenizer-model <path_to_sp_tokenizer_model>
+     --tokenizer-model <path_to_sp_tokenizer_model> \
+     --conv-template llama_2 # nvgpt, llama_2, v1 (vicuna)
 """
 
 import os
@@ -49,6 +50,13 @@ def get_args():
         "--in-file", type=str, default=None, required=True, help="Path to Huggingface LLaMA checkpoints",
     )
     parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.")
+    parser.add_argument(
+        "--conv-template",
+        type=str,
+        default="llama_2",
+        required=False,
+        help="Conversation template: nvgpt, llama_2, v1 (vicuna)",
+    )
     parser.add_argument(
         "--tokenizer-model", type=str, default=None, required=False, help="Path to sentencepiece tokenizer model."
     )
@@ -121,6 +129,8 @@ def load_config(args, llava_config):
         nemo_config.num_query_groups = llava_config['num_key_value_heads']
     nemo_config.use_cpu_initialization = True
     nemo_config.activation = 'fast-swiglu'
+    nemo_config.data.conv_template = args.conv_template
+    nemo_config.mm_cfg.model_type = args.conv_template
     if args.tokenizer_model is None:
         nemo_config.tokenizer.model = llava_config['tokenizer_model']
     else:
 
@@ -0,0 +1,41 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+
+import requests
+
+# URL of the Gradio server
+url = 'http://localhost:8890/api/predict/'
+
+# Prepare the text data
+text_data = '<image>Describe this image please.'
+
+# Prepare the image data
+with open("/path/to/images/001.jpg", "rb") as image_file:
+    encoded_string = base64.b64encode(image_file.read()).decode()
+
+# Data to send
+data = {'data': [text_data, encoded_string]}
+
+# Sending a POST request to the Gradio server
+response = requests.post(url, json=data)
+
+# Checking if the request was successful
+if response.status_code == 200:
+    # Parsing the response
+    response_data = response.json()
+    print("Response from server:", response_data)
+else:
+    print("Failed to get a response from the server, status code:", response.status_code)
@@ -0,0 +1,108 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import io
+
+import gradio as gr
+import PIL.Image
+from omegaconf import OmegaConf
+
+from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor
+
+CFG_STRING = """
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+
+inference:
+  greedy: False # Whether or not to use sampling ; use greedy decoding otherwise
+  top_k: 0  # The number of highest probability vocabulary tokens to keep for top-k-filtering.
+  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+  temperature: 0.2 # sampling temperature
+  add_BOS: False # add the bos token at the begining of the prompt
+  tokens_to_generate: 256 # The minimum length of the sequence to be generated.
+  all_probs: False  # whether return the log prob for all the tokens in vocab
+  repetition_penalty: 1.2  # The parameter for repetition penalty. 1.0 means no penalty.
+  min_tokens_to_generate: 0  # The minimum length of the sequence to be generated.
+  compute_logprob: False  # a flag used to compute logprob of all the input text, a very special case of running inference, default False
+  end_strings: ["<extra_id_1>","<extra_id_7>",]  # generation will stop when one of these tokens is generated
+  images_base_path: /pwd/images
+  insert_image_token: null # `left` or `right` or `null`
+
+cluster_type: BCP
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others)
+
+neva_model_file: /pwd/nemo_experiments/nemo_llava.nemo #neva_22b_tp8_finetuned_v1.nemo neva_8b_tp4_finetuned_v1.nemo
+base_model_file: null
+checkpoint_dir: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/checkpoints # checkpoint file dir. This is used to load the PTL checkpoint generated during the Kosmos training
+checkpoint_name: null #megatron_clip--val_loss=0.41-step=13499-consumed_samples=431904.0.ckpt # PTL checkpoint file name, only used for PTL checkpoint loading
+hparams_file: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/version_0/hparams.yaml # model configuration file, only used for PTL checkpoint loading
+"""
+
+cfg = OmegaConf.create(CFG_STRING)
+cfg.neva_model_file = "/path/to/llava-v1.5-7b.nemo"
+model, image_processor = create_neva_model_and_processor(cfg)
+
+
+def predict(prompt, image_base64=None):
+    input_data = {"prompt": prompt}
+    if image_base64 is not None:
+        image_data = base64.b64decode(image_base64)
+        # image = PIL.Image.fromarray(image)
+        image = PIL.Image.open(io.BytesIO(image_data))
+        input_data["image"] = image_processor(image)
+
+    length_params: LengthParam = {
+        "max_length": cfg.inference.tokens_to_generate,
+        "min_length": cfg.inference.min_tokens_to_generate,
+    }
+    sampling_params: SamplingParam = {
+        "use_greedy": cfg.inference.greedy,
+        "temperature": cfg.inference.temperature,
+        "top_k": cfg.inference.top_k,
+        "top_p": cfg.inference.top_p,
+        "repetition_penalty": cfg.inference.repetition_penalty,
+        "add_BOS": cfg.inference.add_BOS,
+        "all_probs": cfg.inference.all_probs,
+        "compute_logprob": cfg.inference.compute_logprob,
+        "end_strings": cfg.inference.end_strings,
+    }
+
+    # Generate model responses
+    responses = model.generate(
+        input_prompts=[input_data],  # Adjust based on your model's requirements
+        length_params=length_params,  # Define these parameters as in your original code
+        sampling_params=sampling_params,  # Define these parameters as in your original code
+        inference_config=cfg,
+    )
+
+    return responses[0]["clean_response"]
+
+
+iface = gr.Interface(
+    fn=predict,
+    inputs=[gr.Textbox(), gr.Textbox()],
+    outputs="text",
+    title="Multimodal Model Inference",
+    description="Enter a prompt and optionally upload an image for model inference.",
+)
+
+if __name__ == "__main__":
+    iface.launch(server_port=8890, share=False)