From 8768fe6176ea6cb14a08d961c9b57096c80a00ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Fri, 17 May 2024 15:17:15 +0000
Subject: [PATCH 01/43] Remove extra whitespaces

---
 examples/scripts/vsft_llava.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/scripts/vsft_llava.py b/examples/scripts/vsft_llava.py
index 85cb98d5f3c..8a26477737d 100644
--- a/examples/scripts/vsft_llava.py
+++ b/examples/scripts/vsft_llava.py
@@ -32,7 +32,7 @@
     
 # peft:
 python examples/scripts/vsft_llava.py \
-    --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \    
+    --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \
     --model_name_or_path="llava-hf/llava-1.5-7b-hf" \
     --report_to="wandb" \
     --learning_rate=1.4e-5 \
@@ -45,11 +45,11 @@
     --gradient_checkpointing \
     --remove_unused_columns=False \
     --torch_dtype=float16 \
-    --fp16=True \ 
+    --fp16=True \
     --use_peft=True \
     --lora_r=64 \
     --lora_alpha=16 \
-    --lora_target_modules=all-linear"
+    --lora_target_modules=all-linear
 
 # evaluation:
  

From 5d43f2b5615688f75232cdacff1c38837920f729 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Fri, 17 May 2024 18:06:42 +0000
Subject: [PATCH 02/43] idefics

---
 examples/scripts/vsft_idefics2.py | 226 ++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 examples/scripts/vsft_idefics2.py

diff --git a/examples/scripts/vsft_idefics2.py b/examples/scripts/vsft_idefics2.py
new file mode 100644
index 00000000000..f74867afc64
--- /dev/null
+++ b/examples/scripts/vsft_idefics2.py
@@ -0,0 +1,226 @@
+# flake8: noqa
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+# regular:
+python examples/scripts/vsft_idefics2.py \
+    --dataset_name=HuggingFaceH4/cord-v2 \
+    --model_name_or_path=HuggingFaceM4/idefics2-8b \
+    --report_to=wandb \
+    --learning_rate=1e-4 \
+    --per_device_train_batch_size=2 \
+    --gradient_accumulation_steps=8 \
+    --output_dir=data/vsft-idefics2 \
+    --logging_steps=5 \
+    --num_train_epochs=1 \
+    --push_to_hub \
+    --gradient_checkpointing \
+    --remove_unused_columns=False \
+    --torch_dtype=float16
+
+
+# peft:
+python examples/scripts/vsft_idefics2.py \
+    --model_name_or_path="HuggingFaceM4/idefics2-tfrm-compatible" \
+    --report_to="wandb" \
+    --learning_rate=1.4e-5 \
+    --per_device_train_batch_size=8 \
+    --gradient_accumulation_steps=1 \
+    --output_dir="data/vsft-llava-1.5-7b-hf" \
+    --logging_steps=5 \
+    --num_train_epochs=1 \
+    --push_to_hub \
+    --gradient_checkpointing \
+    --remove_unused_columns=False \
+    --torch_dtype=float16 \
+    --fp16=True \
+    --dataset_name=HuggingFaceH4/llava-instruct-mix-vsft \
+    --use_peft=True \
+    --lora_r=64 \
+    --lora_alpha=16 \
+    --lora_target_modules=all-linear
+
+# evaluation:
+ 
+To evaluate, first install the lmms-eval framework: pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+then run:
+accelerate launch --num_processes=8 -m lmms_eval \
+        --model llava_hf \
+        --model_args pretrained=llava-hf/llava-1.5-7b-hf \
+        --tasks mmbench \
+        --batch_size 1 \
+        --output_path ./logs/ \
+        --log_sample    
+"""
+import logging
+import os
+from contextlib import nullcontext
+
+TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
+
+from trl.commands.cli_utils import init_zero_verbose, TrlParser, SFTScriptArguments
+
+if TRL_USE_RICH:
+    init_zero_verbose()
+    FORMAT = "%(message)s"
+
+    from rich.console import Console
+    from rich.logging import RichHandler
+
+import torch
+from accelerate import Accelerator
+from datasets import load_dataset
+
+from tqdm.rich import tqdm
+from transformers import AutoTokenizer, AutoProcessor, Idefics2ForConditionalGeneration
+
+from trl import (
+    ModelConfig,
+    RichProgressCallback,
+    SFTTrainer,
+    SFTConfig,
+    get_peft_config,
+    get_quantization_config,
+    get_kbit_device_map,
+)
+
+tqdm.pandas()
+
+if TRL_USE_RICH:
+    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO)
+
+
+if __name__ == "__main__":
+    parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig))
+    args, training_args, model_config = parser.parse_args_and_config()
+    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=True)
+    # Force use our print callback
+    if TRL_USE_RICH:
+        training_args.disable_tqdm = True
+        console = Console()
+
+    ################
+    # Model, Tokenizer & Processor
+    ################
+    # IDEFICS2_CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
+
+    torch_dtype = model_config.torch_dtype if model_config.torch_dtype in ["auto", None] else getattr(torch, model_config.torch_dtype)
+    quantization_config = get_quantization_config(model_config)
+    model_kwargs = dict(
+        revision=model_config.model_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        attn_implementation=model_config.attn_implementation,
+        torch_dtype=torch_dtype,
+        use_cache=False,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    # tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
+    # tokenizer.chat_template = IDEFICS2_CHAT_TEMPLATE
+    processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
+    # processor.tokenizer = tokenizer
+
+    # model = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+
+    ######
+    from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
+    from peft import LoraConfig
+
+    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        "HuggingFaceM4/idefics2-8b", torch_dtype=torch.float16, quantization_config=bnb_config
+    )
+    lora_config = LoraConfig(
+        r=8,
+        lora_alpha=8,
+        lora_dropout=0.1,
+        target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
+        use_dora=False,
+        init_lora_weights="gaussian",
+    )
+    model.add_adapter(lora_config)
+    model.enable_adapters()
+    ######
+
+    ################
+    # Create a data collator to encode text and image pairs
+    ################
+
+    class Idefics2DataCollator:
+        def __init__(self, processor):
+            self.processor = processor
+
+        def __call__(self, examples):
+            texts = []
+            images = []
+            for example in examples:
+                if len(example["images"]) > 1:
+                    raise ValueError("This collator only supports one image per example")
+                messages = example["messages"]
+                text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+                texts.append(text)
+                images.append([example["images"][0]])
+
+            batch = self.processor(texts, images, return_tensors="pt", padding=True)
+
+            labels = batch["input_ids"].clone()
+            if self.processor.tokenizer.pad_token_id is not None:
+                labels[labels == self.processor.tokenizer.pad_token_id] = -100
+            batch["labels"] = labels
+
+            return batch
+
+    data_collator = Idefics2DataCollator(processor)
+
+    ################
+    # Dataset
+    ################
+    raw_datasets = load_dataset(args.dataset_name)
+    train_dataset = raw_datasets["train"]
+    eval_dataset = raw_datasets["test"]
+
+    ################
+    # Optional rich context managers
+    ###############
+    init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the SFTTrainer...")
+    save_context = (
+        nullcontext()
+        if not TRL_USE_RICH
+        else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
+    )
+
+    ################
+    # Training
+    ################
+    with init_context:
+        trainer = SFTTrainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            dataset_text_field="text",  # need a dummy field
+            tokenizer=processor.tokenizer,
+            # peft_config=get_peft_config(model_config),
+            callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
+            data_collator=data_collator,
+            dataset_kwargs={"skip_prepare_dataset": True},
+        )
+
+    trainer.train()
+
+    with save_context:
+        trainer.save_model(training_args.output_dir)
+        trainer.push_to_hub()
+        if Accelerator().is_main_process:
+            processor.push_to_hub(training_args.hub_model_id)

From f5a3237f4ffcccd049aee53e40ef28d766a9e242 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 27 May 2024 18:09:59 +0000
Subject: [PATCH 03/43] vdpo

---
 examples/scripts/vdpo.py    | 183 ++++++++++++++++++++++++++++++++++++
 trl/trainer/model_config.py |   4 +-
 2 files changed, 185 insertions(+), 2 deletions(-)
 create mode 100644 examples/scripts/vdpo.py

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
new file mode 100644
index 00000000000..f0510e0d7b3
--- /dev/null
+++ b/examples/scripts/vdpo.py
@@ -0,0 +1,183 @@
+# flake8: noqa
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+# regular: # OOM error
+python examples/scripts/vdpo.py \
+    --dataset_name=HuggingFaceH4/vqa_preferences \
+    --model_name_or_path=HuggingFaceM4/idefics2-8b \
+    --per_device_train_batch_size 4 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="dpo_anthropic_hh" \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns
+
+# peft:
+python examples/scripts/vdpo.py \
+    --dataset_name=HuggingFaceH4/vqa_preferences \
+    --model_name_or_path=HuggingFaceM4/idefics2-8b \
+    --per_device_train_batch_size 4 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="dpo_anthropic_hh" \
+    --optim rmsprop \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns \
+    --use_peft \
+    --load_in_4bit \
+    --lora_target_module .*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$
+
+"""
+
+import logging
+import multiprocessing
+import os
+from contextlib import nullcontext
+
+TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
+
+from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
+
+if TRL_USE_RICH:
+    init_zero_verbose()
+    FORMAT = "%(message)s"
+
+    from rich.console import Console
+    from rich.logging import RichHandler
+
+import torch
+from datasets import load_dataset
+from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, AutoProcessor
+
+from trl import (
+    DPOConfig,
+    DPOTrainer,
+    ModelConfig,
+    RichProgressCallback,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+
+
+if TRL_USE_RICH:
+    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO)
+
+
+if __name__ == "__main__":
+    parser = TrlParser((DPOScriptArguments, DPOConfig, ModelConfig))
+    args, training_args, model_config = parser.parse_args_and_config()
+
+    # Force use our print callback
+    if TRL_USE_RICH:
+        training_args.disable_tqdm = True
+        console = Console()
+
+    ################
+    # Model & Tokenizer
+    ################
+    torch_dtype = (
+        model_config.torch_dtype
+        if model_config.torch_dtype in ["auto", None]
+        else getattr(torch, model_config.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_config)
+    model_kwargs = dict(
+        revision=model_config.model_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        attn_implementation=model_config.attn_implementation,
+        torch_dtype=torch_dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    model = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    peft_config = get_peft_config(model_config)
+    if peft_config is None:
+        model_ref = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    else:
+        model_ref = None
+    tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
+    processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    if tokenizer.chat_template is None:
+        tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
+    if args.ignore_bias_buffers:
+        # torch distributed hack
+        model._ddp_params_and_buffers_to_ignore = [
+            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
+        ]
+
+    ################
+    # Optional rich context managers
+    ###############
+    init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the DPOTrainer...")
+    save_context = (
+        nullcontext()
+        if not TRL_USE_RICH
+        else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
+    )
+
+    ################
+    # Dataset
+    ################
+    ds = load_dataset(args.dataset_name)
+    if args.sanity_check:
+        for key in ds:
+            ds[key] = ds[key].select(range(50))
+
+    def process(row):
+        row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
+        row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
+        return row
+
+    ds = ds.map(
+        process,
+        # num_proc=multiprocessing.cpu_count(),
+        load_from_cache_file=False,
+    )
+    train_dataset = ds[args.dataset_train_split]
+    eval_dataset = ds[args.dataset_test_split]
+
+    ################
+    # Training
+    ################
+    with init_context:
+        trainer = DPOTrainer(
+            model,
+            model_ref,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            tokenizer=tokenizer,
+            peft_config=get_peft_config(model_config),
+            callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
+        )
+
+    trainer.train()
+
+    with save_context:
+        trainer.save_model(training_args.output_dir)
diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py
index c30fa4ae497..b16a07421db 100644
--- a/trl/trainer/model_config.py
+++ b/trl/trainer/model_config.py
@@ -86,5 +86,5 @@ def __post_init__(self):
         if self.load_in_8bit and self.load_in_4bit:
             raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
 
-        if self.lora_target_modules == ["all-linear"]:
-            self.lora_target_modules = "all-linear"
+        if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1:
+            self.lora_target_modules = self.lora_target_modules[0]

From 682c0345c53ff4656908fbcdbc673a26c20e208e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 27 May 2024 18:17:28 +0000
Subject: [PATCH 04/43] sft idefics

---
 examples/scripts/sft_idefics.py | 113 ++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 examples/scripts/sft_idefics.py

diff --git a/examples/scripts/sft_idefics.py b/examples/scripts/sft_idefics.py
new file mode 100644
index 00000000000..6a1d5eea575
--- /dev/null
+++ b/examples/scripts/sft_idefics.py
@@ -0,0 +1,113 @@
+"""
+`CUDA_VISIBLE_DEVICES=1 python mre.py` works fine
+without bnb: `CUDA_VISIBLE_DEVICES=1 python mre.py` doesn't work (diverges)
+`accelerate launch mre.py` diverges
+
+Seems to be training without bnb that fails!
+"""
+
+import torch
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import AutoProcessor, Idefics2ForConditionalGeneration, Trainer, TrainingArguments, BitsAndBytesConfig
+from trl import get_kbit_device_map
+
+USE_QLORA = True  # QLora
+
+if __name__ == "__main__":
+    # Load the model and processor
+    model_name = "HuggingFaceM4/idefics2-8b"
+    if USE_QLORA:
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
+    else:
+        quantization_config = None
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        quantization_config=quantization_config,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+    )
+    lora_config = LoraConfig(
+        r=8,
+        lora_alpha=8,
+        lora_dropout=0.1,
+        target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
+        init_lora_weights="gaussian",
+        use_dora=False if USE_QLORA else True,
+    )
+    model.add_adapter(lora_config)
+    model.enable_adapters()
+
+    processor = AutoProcessor.from_pretrained(model_name, do_image_splitting=False)
+
+    # Load a dataset
+    dataset = load_dataset("HuggingFaceH4/llava-instruct-mix-vsft")
+    # dataset = load_dataset("HuggingFaceH4/cord-v2")
+
+    # Process the dataset
+    def data_collator(examples, add_generation_prompt=False):
+        messages = [example["messages"] for example in examples]
+        images = [example["images"] for example in examples]
+        text = processor.apply_chat_template(messages, add_generation_prompt=add_generation_prompt)
+        batch = processor(text, images, return_tensors="pt", padding=True)
+        labels = batch["input_ids"].clone()
+        if processor.tokenizer.pad_token_id is not None:
+            image_token = processor.tokenizer("<image>", add_special_tokens=False).input_ids[0]
+            labels[labels == processor.tokenizer.pad_token_id] = image_token
+        batch["labels"] = labels
+        return batch
+
+    # Test before training
+    # example = dataset["test"][0]
+    # example["messages"] = example["messages"][:-1]  # remove the last message (it's the answer)
+    # example["images"][0].save("image.jpg")
+    # inputs = data_collator([example], add_generation_prompt=True)
+    # exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
+    # bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+    # generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_new_tokens=1000)
+    # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    # for i, t in enumerate(generated_text):
+    #     print(f"{i}:\n{t}\n")
+
+    trainer = Trainer(
+        model=model,
+        args=TrainingArguments(
+            output_dir="./results",
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=8,
+            logging_steps=10,
+            num_train_epochs=1,
+            logging_dir="./logs",
+            remove_unused_columns=False,
+            max_grad_norm=1.0,
+        ),
+        train_dataset=dataset["train"],
+        data_collator=data_collator,
+    )
+
+    trainer.train()
+
+    # Save the model
+    model.save_pretrained("idefics2-8b-fst-llava-instruct-mix")
+
+    # Test after training
+    # example = dataset["test"][0]
+    # example["messages"] = example["messages"][:-1]  # remove the last message (it's the answer)
+    # inputs = data_collator([example], add_generation_prompt=True)
+    # exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
+    # bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+    # generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_new_tokens=1000)
+    # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    # for i, t in enumerate(generated_text):
+    #     print(f"{i}:\n{t}\n")
+
+
+# accelerate launch python sft_idefics.py
+# OK
+
+# Issues:
+
+# python mre.py
+# TypeError: DynamicCache.__init__() takes 1 positional argument but 2 were given
+
+# python mre.py with LORA and no QLORA, diverges (all numbers of devices)

From bf01bf306a804b5d0aba34ac268bf2b85d8518f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Thu, 30 May 2024 18:25:18 +0000
Subject: [PATCH 05/43] pad with test

---
 tests/test_utils.py  | 60 ++++++++++++++++++++++++++++++++++
 trl/trainer/utils.py | 78 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 128 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_utils.py

diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000000..e50e383db7f
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,60 @@
+import unittest
+
+import torch
+
+from trl.trainer.utils import pad
+
+
+class TestPad(unittest.TestCase):
+    def test_pad_1_dim_left(self):
+        x = torch.tensor([1, 2, 3])
+        y = torch.tensor([4, 5])
+        output = pad((x, y), padding_value=0, padding_side="left")
+        expected = torch.tensor([[1, 2, 3], [0, 4, 5]])
+        self.assertTrue(torch.equal(output, expected))
+
+    def test_pad_1_dim_right(self):
+        x = torch.tensor([1, 2, 3])
+        y = torch.tensor([4, 5])
+        output = pad((x, y), padding_value=0, padding_side="right")
+        expected = torch.tensor([[1, 2, 3], [4, 5, 0]])
+        self.assertTrue(torch.equal(output, expected))
+
+    def test_pad_2_dim_left(self):
+        x = torch.tensor([[1, 2], [3, 4]])
+        y = torch.tensor([[5, 6]])
+        output = pad((x, y), padding_value=0, padding_side="left")
+        expected = torch.tensor(
+            [
+                [[1, 2], [3, 4]],
+                [[0, 0], [5, 6]],
+            ]
+        )
+        self.assertTrue(torch.equal(output, expected))
+
+    def test_pad_2_dim_right(self):
+        x = torch.tensor([[1, 2], [3, 4]])
+        y = torch.tensor([[5, 6]])
+        output = pad((x, y), padding_value=0, padding_side="right")
+        expected = torch.tensor(
+            [
+                [[1, 2], [3, 4]],
+                [[5, 6], [0, 0]],
+            ]
+        )
+        self.assertTrue(torch.equal(output, expected))
+
+    def test_pad_2_dim_right_multidim(self):
+        x = torch.tensor([[1, 2], [3, 4]])
+        y = torch.tensor([[5]])
+        output = pad((x, y), padding_value=0, padding_side="right")
+        expected = torch.tensor(
+            [
+                [[1, 2], [3, 4]],
+                [[5, 0], [0, 0]],
+            ]
+        )
+        self.assertTrue(torch.equal(output, expected))
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index d959e7edff8..ec0065def35 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -296,6 +296,55 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         return batch
 
 
+
+def pad(tensors: List[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
+    """
+    Pads a list of tensors to the same shape along the first dimension.
+
+    Args:
+        tensors (`List[torch.Tensor]`):
+            List of input tensors to pad.
+        padding_value (`int`):
+            Value to use for padding. Default is 0.
+        padding_side (`str`):
+            Side on which to add padding. Must be 'left' or 'right'. Default is 'right'.
+
+    Returns:
+        `torch.Tensor`:
+            A single tensor containing the padded tensors.
+    
+    Examples:
+        >>> import torch
+        >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])
+        tensor([[1, 2, 3],
+                [4, 5, 0]])
+        >>> pad([torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6]])])
+        tensor([[[1, 2],
+                [3, 4]],
+
+                [[5, 6],
+                [0, 0]]])
+    """
+    # Determine the maximum shape for each dimension
+    output_shape = np.max([t.shape for t in tensors], 0).tolist()
+
+    # Create an output tensor filled with the padding value
+    output = torch.full((len(tensors), *output_shape), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)
+
+    for i, t in enumerate(tensors):
+        # Determine the slice for the sequence dimension
+        if padding_side == "left":
+            seq_slice = slice(output_shape[0] - t.shape[0], output_shape[0])
+        elif padding_side == "right":
+            seq_slice = slice(0, t.shape[0])
+        else:
+            raise ValueError("padding_side must be 'left' or 'right'")
+
+        slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:])
+        output[i][slices] = t
+
+    return output
+
 @dataclass
 class DPODataCollatorWithPadding:
     r"""
@@ -317,7 +366,7 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         # first, pad everything to the same length
         padded_batch = {}
         for k in features[0].keys():
-            if k.endswith("_input_ids") or k.endswith("_attention_mask") or k.endswith("_labels"):
+            if k.endswith(("_input_ids", "_attention_mask", "_labels", "_pixel_values")):
                 if self.is_encoder_decoder:
                     to_pad = [torch.LongTensor(ex[k]) for ex in features]
 
@@ -337,11 +386,7 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                         raise ValueError(f"Unexpected key in batch '{k}'")
                     padded_batch[k] = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)
                 else:
-                    # adapted from https://stackoverflow.com/questions/73256206
-                    if "prompt" in k:
-                        to_pad = [torch.LongTensor(ex[k][::-1]) for ex in features]
-                    else:
-                        to_pad = [torch.LongTensor(ex[k]) for ex in features]
+                    # Set padding value based on the key
                     if k.endswith("_input_ids"):
                         if self.pad_token_id is None:
                             raise ValueError(
@@ -354,13 +399,26 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                         padding_value = self.label_pad_token_id
                     elif k.endswith("_attention_mask"):
                         padding_value = 0
+                    elif k.endswith("_pixel_values"):
+                        padding_value = 0  # TODO: check if this is correct
                     else:
                         raise ValueError(f"Unexpected key in batch '{k}'")
 
-                    padded_batch[k] = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)
-                    # for the prompt, flip back so padding is on left side
-                    if "prompt" in k:
-                        padded_batch[k] = padded_batch[k].flip(dims=[1])
+                    # Set padding side based on the key
+                    if k in ["prompt_input_ids", "prompt_attention_mask"]:
+                        padding_side = "left"
+                    else:
+                        padding_side = "right"
+
+                    # Set the dtype
+                    if k.endswith("_pixel_values"):
+                        dtype = torch.float32
+                    else:
+                        dtype = torch.int64
+
+                    # Convert to tensor and pad
+                    to_pad = [torch.tensor(ex[k], dtype=dtype) for ex in features]
+                    padded_batch[k] = pad(to_pad, padding_value=padding_value, padding_side=padding_side)
             elif k.endswith("_logps"):
                 # the cached reference model logprobs
                 padded_batch[k] = torch.tensor([ex[k] for ex in features])

From aed1aebddd9b9247ea9915654ad50a7b2a06d46d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Thu, 30 May 2024 18:25:45 +0000
Subject: [PATCH 06/43] use prompt instead of tokenizer

---
 examples/scripts/vdpo.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index f0510e0d7b3..da9ddf9972e 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -130,6 +130,9 @@
         model._ddp_params_and_buffers_to_ignore = [
             name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
         ]
+    processor.pad_token_id = tokenizer.pad_token_id
+    processor.bos_token_id = tokenizer.bos_token_id # needed for DPOTrainer
+    processor.eos_token_id = tokenizer.eos_token_id # needed for DPOTrainer
 
     ################
     # Optional rich context managers
@@ -150,6 +153,7 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
+        row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
         row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
         return row
@@ -172,7 +176,7 @@ def process(row):
             args=training_args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
-            tokenizer=tokenizer,
+            tokenizer=processor,
             peft_config=get_peft_config(model_config),
             callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
         )

From e814f88f50e0abaf24056428dfb14c828d988f34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Thu, 30 May 2024 18:28:31 +0000
Subject: [PATCH 07/43] rm name main

---
 tests/test_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index e50e383db7f..cd8d7871f74 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -56,5 +56,3 @@ def test_pad_2_dim_right_multidim(self):
         )
         self.assertTrue(torch.equal(output, expected))
 
-if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file

From fd5d71b8801a4c4cc0af139477c8ae480a8327a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Thu, 30 May 2024 18:38:46 +0000
Subject: [PATCH 08/43] support vlm in tokenize row

---
 trl/trainer/dpo_trainer.py | 61 ++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 36ce79d3a4d..5aa57cf1b75 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -314,6 +314,12 @@ def make_inputs_require_grad(module, input, output):
         else:
             self.is_encoder_decoder = args.is_encoder_decoder
 
+        if model is not None:
+            self.is_vision_model = model.config.model_type in ["idefics2"]  # TODO: find a better way to check if its a vision model
+        else:
+            warnings.warn("No model provided, cannot determine if it is a vision model. Setting is_vision_model to False.")
+            self.is_vision_model = False
+
         self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
         if model_adapter_name is not None:
             warnings.warn(
@@ -489,9 +495,9 @@ def make_inputs_require_grad(module, input, output):
         # see: https://github.com/huggingface/trl/pull/1255
         with PartialState().local_main_process_first():
             # tokenize the dataset
-            train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc)
+            train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, load_from_cache_file=False)
             if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc)
+                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, load_from_cache_file=False)
 
         super().__init__(
             model=model,
@@ -663,16 +669,22 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa
 
         return super().get_eval_dataloader(eval_dataset=eval_dataset)
 
-    def build_tokenized_answer(self, prompt, answer):
+    def build_tokenized_answer(self, prompt, answer, images=None):
         """
         Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`.
         It does ensure `enc(a + b) = enc(a) + enc(a + b)[len(enc(a)):]`.
         Reference:
             https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
         """
-
-        full_tokenized = self.tokenizer(prompt + answer, add_special_tokens=False)
-        prompt_input_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
+        if self.is_vision_model:
+            if answer.count("<image>") > 0:
+                raise NotImplementedError("Answer contains <image> token, which is not supported yet.")
+            full_tokenized = self.tokenizer(prompt + answer, images=images, add_special_tokens=False)
+            full_tokenized = {k: v[0] for k, v in full_tokenized.items()}  # Unbatch, not done when using idefics
+            prompt_input_ids = self.tokenizer(prompt, images=images, add_special_tokens=False)["input_ids"][0]
+        else:
+            full_tokenized = self.tokenizer(prompt + answer, add_special_tokens=False)
+            prompt_input_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
 
         answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
         answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
@@ -706,12 +718,23 @@ def build_tokenized_answer(self, prompt, answer):
         answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
         answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
 
-        return dict(
-            prompt_input_ids=prompt_input_ids,
-            prompt_attention_mask=prompt_attention_mask,
-            input_ids=answer_input_ids,
-            attention_mask=answer_attention_mask,
-        )
+        if self.is_vision_model:
+            return dict(
+                prompt_input_ids=prompt_input_ids,
+                prompt_attention_mask=prompt_attention_mask,
+                prompt_pixel_values=full_tokenized["pixel_values"],
+                prompt_pixel_attention_mask=full_tokenized["pixel_attention_mask"],
+                input_ids=answer_input_ids,
+                attention_mask=answer_attention_mask,
+            )
+        else:
+            return dict(
+                prompt_input_ids=prompt_input_ids,
+                prompt_attention_mask=prompt_attention_mask,
+                input_ids=answer_input_ids,
+                attention_mask=answer_attention_mask,
+                pixel_value=full_tokenized
+            )
 
     def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> Dict:
         """Tokenize a single row from a DPO specific dataset.
@@ -728,6 +751,8 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
         prompt = feature["prompt"]
         chosen = feature["chosen"]
         rejected = feature["rejected"]
+        if self.is_vision_model:
+            images = feature["images"]
 
         if not self.is_encoder_decoder:
             # Check issues below for more details
@@ -737,16 +762,22 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
 
             if not isinstance(prompt, str):
                 raise ValueError(f"prompt should be an str but got {type(prompt)}")
-            prompt_tokens = self.tokenizer(prompt, add_special_tokens=False)
+            if self.is_vision_model:
+                prompt_tokens = self.tokenizer(prompt, images=images, add_special_tokens=False)
+                prompt_tokens = {k: v[0] for k, v in prompt_tokens.items()}  # Unbatch, not done when using idefics
+            else:
+                prompt_tokens = self.tokenizer(prompt, add_special_tokens=False)
+
             prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
 
             if not isinstance(chosen, str):
                 raise ValueError(f"chosen should be an str but got {type(chosen)}")
-            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen, images)
 
             if not isinstance(rejected, str):
                 raise ValueError(f"rejected should be an str but got {type(rejected)}")
-            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected, images)
 
             # Last prompt token might get merged by tokenizer and
             # it should not be included for generation if that happens

From e1b87552e2579ffdceae38a10e23e39e2ddb5c8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Thu, 30 May 2024 18:39:32 +0000
Subject: [PATCH 09/43] temp fix for regex in lora_target_module

---
 trl/trainer/model_config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py
index b16a07421db..6bb50031ed8 100644
--- a/trl/trainer/model_config.py
+++ b/trl/trainer/model_config.py
@@ -86,5 +86,7 @@ def __post_init__(self):
         if self.load_in_8bit and self.load_in_4bit:
             raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
 
+        # if self.lora_target_modules == ["all-linear"]:
+        #     self.lora_target_modules = "all-linear"
         if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1:
             self.lora_target_modules = self.lora_target_modules[0]

From 8075419b7dbd788fa75494f32165cd56939d0121 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Fri, 31 May 2024 13:54:38 +0000
Subject: [PATCH 10/43] format

---
 trl/trainer/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index ec0065def35..8d9adf43370 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -312,7 +312,7 @@ def pad(tensors: List[torch.Tensor], padding_value: int = 0, padding_side: str =
     Returns:
         `torch.Tensor`:
             A single tensor containing the padded tensors.
-    
+
     Examples:
         >>> import torch
         >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])

From 1b815c2888b2e0328ca05457e2f6a8672a6ff65c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Fri, 31 May 2024 18:14:50 +0000
Subject: [PATCH 11/43] vdpo

---
 examples/scripts/vdpo.py | 57 +++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index da9ddf9972e..f5bcd326f2e 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -31,24 +31,22 @@
 
 # peft:
 python examples/scripts/vdpo.py \
-    --dataset_name=HuggingFaceH4/vqa_preferences \
-    --model_name_or_path=HuggingFaceM4/idefics2-8b \
-    --per_device_train_batch_size 4 \
-    --learning_rate 1e-3 \
-    --gradient_accumulation_steps 1 \
-    --logging_steps 10 \
-    --eval_steps 500 \
-    --output_dir="dpo_anthropic_hh" \
-    --optim rmsprop \
-    --warmup_steps 150 \
+    --dataset_name HuggingFaceH4/vqa_preferences \
+    --model_name_or_path HuggingFaceM4/idefics2-8b \
+    --per_device_train_batch_size 8 \
+    --learning_rate 1e-5 \
+    --gradient_accumulation_steps 8 \
+    --logging_steps 5 \
+    --output_dir dpo_idefics \
+    --warmup_steps 10 \
     --report_to wandb \
     --bf16 \
+    --torch_dtype bfloat16 \
     --logging_first_step \
     --no_remove_unused_columns \
     --use_peft \
-    --load_in_4bit \
+    --dataloader_num_workers 8
     --lora_target_module .*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$
-
 """
 
 import logging
@@ -59,7 +57,7 @@
 TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
 
 from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
-
+from accelerate import PartialState
 if TRL_USE_RICH:
     init_zero_verbose()
     FORMAT = "%(message)s"
@@ -69,7 +67,7 @@
 
 import torch
 from datasets import load_dataset
-from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, AutoProcessor
+from transformers import AutoModelForVision2Seq, AutoProcessor
 
 from trl import (
     DPOConfig,
@@ -98,11 +96,7 @@
     ################
     # Model & Tokenizer
     ################
-    torch_dtype = (
-        model_config.torch_dtype
-        if model_config.torch_dtype in ["auto", None]
-        else getattr(torch, model_config.torch_dtype)
-    )
+    torch_dtype = model_config.torch_dtype if model_config.torch_dtype in ["auto", None] else getattr(torch, model_config.torch_dtype)
     quantization_config = get_quantization_config(model_config)
     model_kwargs = dict(
         revision=model_config.model_revision,
@@ -113,26 +107,26 @@
         device_map=get_kbit_device_map() if quantization_config is not None else None,
         quantization_config=quantization_config,
     )
-    model = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     peft_config = get_peft_config(model_config)
     if peft_config is None:
-        model_ref = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+        model_ref = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     else:
         model_ref = None
-    tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
     processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
+    tokenizer = processor.tokenizer
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     if tokenizer.chat_template is None:
         tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
     if args.ignore_bias_buffers:
         # torch distributed hack
-        model._ddp_params_and_buffers_to_ignore = [
-            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
-        ]
+        model._ddp_params_and_buffers_to_ignore = [name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool]
+    
+    # DPOTrainer needs the processor to have these attributes
     processor.pad_token_id = tokenizer.pad_token_id
-    processor.bos_token_id = tokenizer.bos_token_id # needed for DPOTrainer
-    processor.eos_token_id = tokenizer.eos_token_id # needed for DPOTrainer
+    processor.bos_token_id = tokenizer.bos_token_id
+    processor.eos_token_id = tokenizer.eos_token_id
 
     ################
     # Optional rich context managers
@@ -157,12 +151,9 @@ def process(row):
         row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
         return row
-
-    ds = ds.map(
-        process,
-        # num_proc=multiprocessing.cpu_count(),
-        load_from_cache_file=False,
-    )
+    
+    with PartialState().local_main_process_first():
+        ds = ds.map(process, num_proc=multiprocessing.cpu_count())
     train_dataset = ds[args.dataset_train_split]
     eval_dataset = ds[args.dataset_test_split]
 

From 6d6a1946c094fc31e58b16b018443360d1b69470 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 3 Jun 2024 09:45:08 +0000
Subject: [PATCH 12/43] tmp bfloat16 hard code

---
 trl/trainer/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index 8d9adf43370..605e228e435 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -412,7 +412,7 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
                     # Set the dtype
                     if k.endswith("_pixel_values"):
-                        dtype = torch.float32
+                        dtype = torch.bfloat16  # TODO: tmp fix
                     else:
                         dtype = torch.int64
 

From 1935d3dee73b8ce5c794efc0e94b99c5c86f83e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 3 Jun 2024 09:45:40 +0000
Subject: [PATCH 13/43] concatenated_forward support for vision

---
 trl/trainer/dpo_trainer.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 5aa57cf1b75..4f67524d8e1 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -495,9 +495,9 @@ def make_inputs_require_grad(module, input, output):
         # see: https://github.com/huggingface/trl/pull/1255
         with PartialState().local_main_process_first():
             # tokenize the dataset
-            train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, load_from_cache_file=False)
+            train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10)
             if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, load_from_cache_file=False)
+                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10)
 
         super().__init__(
             model=model,
@@ -935,6 +935,7 @@ def compute_reference_log_probs(self, padded_batch: Dict) -> Dict:
     def concatenated_inputs(
         batch: Dict[str, Union[List, torch.LongTensor]],
         is_encoder_decoder: bool = False,
+        is_vision_model: bool = False,
         label_pad_token_id: int = -100,
         padding_value: int = 0,
         device: Optional[torch.device] = None,
@@ -991,6 +992,9 @@ def concatenated_inputs(
                 batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
             )
 
+        if is_vision_model:
+            concatenated_batch["pixel_values"] = batch["prompt_pixel_values"].repeat(2, 1, 1, 1, 1).to(device=device)
+            concatenated_batch["pixel_attention_mask"] = batch["prompt_pixel_attention_mask"].repeat(2, 1, 1, 1).to(device=device)
         return concatenated_batch
 
     def dpo_loss(
@@ -1147,20 +1151,23 @@ def concatenated_forward(
         concatenated_batch = self.concatenated_inputs(
             batch,
             is_encoder_decoder=self.is_encoder_decoder,
+            is_vision_model=self.is_vision_model,
             label_pad_token_id=self.label_pad_token_id,
             padding_value=self.padding_value,
             device=self.accelerator.device,
         )
         len_chosen = batch["chosen_labels"].shape[0]
 
-        model_kwargs = (
-            {
-                "labels": concatenated_batch["concatenated_labels"],
-                "decoder_input_ids": concatenated_batch.pop("concatenated_decoder_input_ids", None),
-            }
-            if self.is_encoder_decoder
-            else {}
-        )
+        model_kwargs = {}
+
+        if self.is_encoder_decoder:
+            model_kwargs["labels"] = concatenated_batch["concatenated_labels"]
+            model_kwargs["decoder_input_ids"] = concatenated_batch.pop("concatenated_decoder_input_ids", None)
+
+        if self.is_vision_model:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+            model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"]
+
         all_logits = model(
             concatenated_batch["concatenated_input_ids"],
             attention_mask=concatenated_batch["concatenated_attention_mask"],

From bdc2b955bc17452c753cdc47ab203692e0e1f5df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 17 Jun 2024 13:06:26 +0000
Subject: [PATCH 14/43] style and new command line

---
 examples/scripts/vdpo.py   | 48 ++++++++++++++------------------------
 tests/test_utils.py        |  1 -
 trl/trainer/dpo_trainer.py | 16 +++++++++----
 trl/trainer/utils.py       |  2 +-
 4 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index f5bcd326f2e..19c2f7417d5 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -13,40 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-# regular: # OOM error
-python examples/scripts/vdpo.py \
-    --dataset_name=HuggingFaceH4/vqa_preferences \
-    --model_name_or_path=HuggingFaceM4/idefics2-8b \
-    --per_device_train_batch_size 4 \
-    --learning_rate 1e-3 \
-    --gradient_accumulation_steps 1 \
-    --logging_steps 10 \
-    --eval_steps 500 \
-    --output_dir="dpo_anthropic_hh" \
-    --warmup_steps 150 \
-    --report_to wandb \
-    --bf16 \
-    --logging_first_step \
-    --no_remove_unused_columns
-
-# peft:
-python examples/scripts/vdpo.py \
-    --dataset_name HuggingFaceH4/vqa_preferences \
+accelerate launch examples/scripts/vdpo.py \
+    --dataset_name HuggingFaceH4/rlaif-v_formatted \
     --model_name_or_path HuggingFaceM4/idefics2-8b \
-    --per_device_train_batch_size 8 \
+    --per_device_train_batch_size 1 \
     --learning_rate 1e-5 \
-    --gradient_accumulation_steps 8 \
     --logging_steps 5 \
-    --output_dir dpo_idefics \
-    --warmup_steps 10 \
-    --report_to wandb \
+    --output_dir dpo_idefics_rlaif-v \
     --bf16 \
     --torch_dtype bfloat16 \
     --logging_first_step \
     --no_remove_unused_columns \
+    --sanity_check \
     --use_peft \
-    --dataloader_num_workers 8
-    --lora_target_module .*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$
+    --lora_target_modules=all-linear
 """
 
 import logging
@@ -58,6 +38,7 @@
 
 from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
 from accelerate import PartialState
+
 if TRL_USE_RICH:
     init_zero_verbose()
     FORMAT = "%(message)s"
@@ -96,8 +77,13 @@
     ################
     # Model & Tokenizer
     ################
-    torch_dtype = model_config.torch_dtype if model_config.torch_dtype in ["auto", None] else getattr(torch, model_config.torch_dtype)
+    torch_dtype = (
+        model_config.torch_dtype
+        if model_config.torch_dtype in ["auto", None]
+        else getattr(torch, model_config.torch_dtype)
+    )
     quantization_config = get_quantization_config(model_config)
+
     model_kwargs = dict(
         revision=model_config.model_revision,
         trust_remote_code=model_config.trust_remote_code,
@@ -121,8 +107,10 @@
         tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
     if args.ignore_bias_buffers:
         # torch distributed hack
-        model._ddp_params_and_buffers_to_ignore = [name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool]
-    
+        model._ddp_params_and_buffers_to_ignore = [
+            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
+        ]
+
     # DPOTrainer needs the processor to have these attributes
     processor.pad_token_id = tokenizer.pad_token_id
     processor.bos_token_id = tokenizer.bos_token_id
@@ -151,7 +139,7 @@ def process(row):
         row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
         return row
-    
+
     with PartialState().local_main_process_first():
         ds = ds.map(process, num_proc=multiprocessing.cpu_count())
     train_dataset = ds[args.dataset_train_split]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cd8d7871f74..5e5c3ec9c9b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -55,4 +55,3 @@ def test_pad_2_dim_right_multidim(self):
             ]
         )
         self.assertTrue(torch.equal(output, expected))
-
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 4f67524d8e1..5af7703d69e 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -315,9 +315,11 @@ def make_inputs_require_grad(module, input, output):
             self.is_encoder_decoder = args.is_encoder_decoder
 
         if model is not None:
-            self.is_vision_model = model.config.model_type in ["idefics2"]  # TODO: find a better way to check if its a vision model
+            self.is_vision_model = model.config.model_type in ["idefics2"]  # TODO: find a better way
         else:
-            warnings.warn("No model provided, cannot determine if it is a vision model. Setting is_vision_model to False.")
+            warnings.warn(
+                "No model provided, cannot determine if it is a vision model. Setting is_vision_model to False."
+            )
             self.is_vision_model = False
 
         self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
@@ -497,7 +499,9 @@ def make_inputs_require_grad(module, input, output):
             # tokenize the dataset
             train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10)
             if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10)
+                eval_dataset = eval_dataset.map(
+                    self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10
+                )
 
         super().__init__(
             model=model,
@@ -733,7 +737,7 @@ def build_tokenized_answer(self, prompt, answer, images=None):
                 prompt_attention_mask=prompt_attention_mask,
                 input_ids=answer_input_ids,
                 attention_mask=answer_attention_mask,
-                pixel_value=full_tokenized
+                pixel_value=full_tokenized,
             )
 
     def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> Dict:
@@ -994,7 +998,9 @@ def concatenated_inputs(
 
         if is_vision_model:
             concatenated_batch["pixel_values"] = batch["prompt_pixel_values"].repeat(2, 1, 1, 1, 1).to(device=device)
-            concatenated_batch["pixel_attention_mask"] = batch["prompt_pixel_attention_mask"].repeat(2, 1, 1, 1).to(device=device)
+            concatenated_batch["pixel_attention_mask"] = (
+                batch["prompt_pixel_attention_mask"].repeat(2, 1, 1, 1).to(device=device)
+            )
         return concatenated_batch
 
     def dpo_loss(
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index 605e228e435..c2a25b0fb1f 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -296,7 +296,6 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         return batch
 
 
-
 def pad(tensors: List[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
     """
     Pads a list of tensors to the same shape along the first dimension.
@@ -345,6 +344,7 @@ def pad(tensors: List[torch.Tensor], padding_value: int = 0, padding_side: str =
 
     return output
 
+
 @dataclass
 class DPODataCollatorWithPadding:
     r"""

From 24b08f51bfe66005e1656ee9a0f9e8d1d0579617 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Mon, 17 Jun 2024 13:13:04 +0000
Subject: [PATCH 15/43] all-linear

---
 trl/trainer/model_config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py
index 6bb50031ed8..b16a07421db 100644
--- a/trl/trainer/model_config.py
+++ b/trl/trainer/model_config.py
@@ -86,7 +86,5 @@ def __post_init__(self):
         if self.load_in_8bit and self.load_in_4bit:
             raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
 
-        # if self.lora_target_modules == ["all-linear"]:
-        #     self.lora_target_modules = "all-linear"
         if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1:
             self.lora_target_modules = self.lora_target_modules[0]

From c5ff8d71eab03af332d8640b2902b9a3e07b0925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 07:52:54 +0000
Subject: [PATCH 16/43] format

---
 examples/scripts/sft_idefics.py   | 14 ++++++++++++--
 examples/scripts/vdpo.py          | 10 +++++++++-
 examples/scripts/vsft_idefics2.py | 16 ++++++++++------
 examples/scripts/vsft_llava.py    | 12 +++++-------
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/examples/scripts/sft_idefics.py b/examples/scripts/sft_idefics.py
index 6a1d5eea575..e942d2e34db 100644
--- a/examples/scripts/sft_idefics.py
+++ b/examples/scripts/sft_idefics.py
@@ -9,16 +9,26 @@
 import torch
 from datasets import load_dataset
 from peft import LoraConfig
-from transformers import AutoProcessor, Idefics2ForConditionalGeneration, Trainer, TrainingArguments, BitsAndBytesConfig
+from transformers import (
+    AutoProcessor,
+    BitsAndBytesConfig,
+    Idefics2ForConditionalGeneration,
+    Trainer,
+    TrainingArguments,
+)
+
 from trl import get_kbit_device_map
 
+
 USE_QLORA = True  # QLora
 
 if __name__ == "__main__":
     # Load the model and processor
     model_name = "HuggingFaceM4/idefics2-8b"
     if USE_QLORA:
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
+        )
     else:
         quantization_config = None
     model = Idefics2ForConditionalGeneration.from_pretrained(
diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index 19c2f7417d5..7f752ab4dde 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -17,14 +17,17 @@
     --dataset_name HuggingFaceH4/rlaif-v_formatted \
     --model_name_or_path HuggingFaceM4/idefics2-8b \
     --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 16 \
     --learning_rate 1e-5 \
     --logging_steps 5 \
     --output_dir dpo_idefics_rlaif-v \
+    --push_to_hub --hub_model_id HuggingFaceH4/idefics2-8b-dpo-rlaif-v \
     --bf16 \
     --torch_dtype bfloat16 \
     --logging_first_step \
     --no_remove_unused_columns \
-    --sanity_check \
+    --dataset_num_proc 50 \
+    --dataload_num_workers 16 \
     --use_peft \
     --lora_target_modules=all-linear
 """
@@ -138,6 +141,11 @@ def process(row):
         row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
         row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
+        for idx, img in enumerate(row["images"]):  # Resize image so that the largest side is 640
+            ratio = min(1.0, 640 / max(img.size))
+            new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+            row["images"][idx] = img.resize(new_size)
+        row["images"] = row["images"]
         return row
 
     with PartialState().local_main_process_first():
diff --git a/examples/scripts/vsft_idefics2.py b/examples/scripts/vsft_idefics2.py
index f74867afc64..c5516ed6b94 100644
--- a/examples/scripts/vsft_idefics2.py
+++ b/examples/scripts/vsft_idefics2.py
@@ -15,19 +15,19 @@
 """
 # regular:
 python examples/scripts/vsft_idefics2.py \
-    --dataset_name=HuggingFaceH4/cord-v2 \
     --model_name_or_path=HuggingFaceM4/idefics2-8b \
+    --dataset_name=HuggingFaceH4/cord-v2 \
     --report_to=wandb \
-    --learning_rate=1e-4 \
-    --per_device_train_batch_size=2 \
-    --gradient_accumulation_steps=8 \
+    --learning_rate=2.0e-5 \
+    --per_device_train_batch_size=8 \
+    --gradient_accumulation_steps=1 \
     --output_dir=data/vsft-idefics2 \
     --logging_steps=5 \
     --num_train_epochs=1 \
     --push_to_hub \
     --gradient_checkpointing \
     --remove_unused_columns=False \
-    --torch_dtype=float16
+    --torch_dtype=bfloat16
 
 
 # peft:
@@ -115,7 +115,11 @@
     ################
     # IDEFICS2_CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
 
-    torch_dtype = model_config.torch_dtype if model_config.torch_dtype in ["auto", None] else getattr(torch, model_config.torch_dtype)
+    torch_dtype = (
+        model_config.torch_dtype
+        if model_config.torch_dtype in ["auto", None]
+        else getattr(torch, model_config.torch_dtype)
+    )
     quantization_config = get_quantization_config(model_config)
     model_kwargs = dict(
         revision=model_config.model_revision,
diff --git a/examples/scripts/vsft_llava.py b/examples/scripts/vsft_llava.py
index 8a26477737d..59753927faf 100644
--- a/examples/scripts/vsft_llava.py
+++ b/examples/scripts/vsft_llava.py
@@ -18,9 +18,9 @@
     --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \
     --model_name_or_path="llava-hf/llava-1.5-7b-hf" \
     --report_to="wandb" \
-    --learning_rate=1.4e-5 \
-    --per_device_train_batch_size=8 \
-    --gradient_accumulation_steps=1 \
+    --learning_rate=1.4e-7 \
+    --per_device_train_batch_size=2 \
+    --gradient_accumulation_steps=32 \
     --output_dir="data/vsft-llava-1.5-7b-hf" \
     --logging_steps=5 \
     --num_train_epochs=1 \
@@ -28,16 +28,15 @@
     --gradient_checkpointing \
     --remove_unused_columns=False \
     --torch_dtype=float16 \
-    --fp16=True
     
 # peft:
 python examples/scripts/vsft_llava.py \
-    --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \
+    --dataset_name="HuggingFaceH4/llava-instruct-mix" \
     --model_name_or_path="llava-hf/llava-1.5-7b-hf" \
     --report_to="wandb" \
     --learning_rate=1.4e-5 \
     --per_device_train_batch_size=8 \
-    --gradient_accumulation_steps=1 \
+    --gradient_accumulation_steps=128 \
     --output_dir="data/vsft-llava-1.5-7b-hf" \
     --logging_steps=5 \
     --num_train_epochs=1 \
@@ -45,7 +44,6 @@
     --gradient_checkpointing \
     --remove_unused_columns=False \
     --torch_dtype=float16 \
-    --fp16=True \
     --use_peft=True \
     --lora_r=64 \
     --lora_alpha=16 \

From a7d17327df6d1b76d64f213ddd335be2407b0160 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 07:55:21 +0000
Subject: [PATCH 17/43] delete old examples

---
 examples/scripts/sft_idefics.py   | 123 ----------------
 examples/scripts/vsft_idefics2.py | 230 ------------------------------
 2 files changed, 353 deletions(-)
 delete mode 100644 examples/scripts/sft_idefics.py
 delete mode 100644 examples/scripts/vsft_idefics2.py

diff --git a/examples/scripts/sft_idefics.py b/examples/scripts/sft_idefics.py
deleted file mode 100644
index e942d2e34db..00000000000
--- a/examples/scripts/sft_idefics.py
+++ /dev/null
@@ -1,123 +0,0 @@
-"""
-`CUDA_VISIBLE_DEVICES=1 python mre.py` works fine
-without bnb: `CUDA_VISIBLE_DEVICES=1 python mre.py` doesn't work (diverges)
-`accelerate launch mre.py` diverges
-
-Seems to be training without bnb that fails!
-"""
-
-import torch
-from datasets import load_dataset
-from peft import LoraConfig
-from transformers import (
-    AutoProcessor,
-    BitsAndBytesConfig,
-    Idefics2ForConditionalGeneration,
-    Trainer,
-    TrainingArguments,
-)
-
-from trl import get_kbit_device_map
-
-
-USE_QLORA = True  # QLora
-
-if __name__ == "__main__":
-    # Load the model and processor
-    model_name = "HuggingFaceM4/idefics2-8b"
-    if USE_QLORA:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
-        )
-    else:
-        quantization_config = None
-    model = Idefics2ForConditionalGeneration.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16,
-        quantization_config=quantization_config,
-        device_map=get_kbit_device_map() if quantization_config is not None else None,
-    )
-    lora_config = LoraConfig(
-        r=8,
-        lora_alpha=8,
-        lora_dropout=0.1,
-        target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
-        init_lora_weights="gaussian",
-        use_dora=False if USE_QLORA else True,
-    )
-    model.add_adapter(lora_config)
-    model.enable_adapters()
-
-    processor = AutoProcessor.from_pretrained(model_name, do_image_splitting=False)
-
-    # Load a dataset
-    dataset = load_dataset("HuggingFaceH4/llava-instruct-mix-vsft")
-    # dataset = load_dataset("HuggingFaceH4/cord-v2")
-
-    # Process the dataset
-    def data_collator(examples, add_generation_prompt=False):
-        messages = [example["messages"] for example in examples]
-        images = [example["images"] for example in examples]
-        text = processor.apply_chat_template(messages, add_generation_prompt=add_generation_prompt)
-        batch = processor(text, images, return_tensors="pt", padding=True)
-        labels = batch["input_ids"].clone()
-        if processor.tokenizer.pad_token_id is not None:
-            image_token = processor.tokenizer("<image>", add_special_tokens=False).input_ids[0]
-            labels[labels == processor.tokenizer.pad_token_id] = image_token
-        batch["labels"] = labels
-        return batch
-
-    # Test before training
-    # example = dataset["test"][0]
-    # example["messages"] = example["messages"][:-1]  # remove the last message (it's the answer)
-    # example["images"][0].save("image.jpg")
-    # inputs = data_collator([example], add_generation_prompt=True)
-    # exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
-    # bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-    # generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_new_tokens=1000)
-    # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    # for i, t in enumerate(generated_text):
-    #     print(f"{i}:\n{t}\n")
-
-    trainer = Trainer(
-        model=model,
-        args=TrainingArguments(
-            output_dir="./results",
-            per_device_train_batch_size=1,
-            gradient_accumulation_steps=8,
-            logging_steps=10,
-            num_train_epochs=1,
-            logging_dir="./logs",
-            remove_unused_columns=False,
-            max_grad_norm=1.0,
-        ),
-        train_dataset=dataset["train"],
-        data_collator=data_collator,
-    )
-
-    trainer.train()
-
-    # Save the model
-    model.save_pretrained("idefics2-8b-fst-llava-instruct-mix")
-
-    # Test after training
-    # example = dataset["test"][0]
-    # example["messages"] = example["messages"][:-1]  # remove the last message (it's the answer)
-    # inputs = data_collator([example], add_generation_prompt=True)
-    # exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
-    # bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-    # generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_new_tokens=1000)
-    # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    # for i, t in enumerate(generated_text):
-    #     print(f"{i}:\n{t}\n")
-
-
-# accelerate launch python sft_idefics.py
-# OK
-
-# Issues:
-
-# python mre.py
-# TypeError: DynamicCache.__init__() takes 1 positional argument but 2 were given
-
-# python mre.py with LORA and no QLORA, diverges (all numbers of devices)
diff --git a/examples/scripts/vsft_idefics2.py b/examples/scripts/vsft_idefics2.py
deleted file mode 100644
index c5516ed6b94..00000000000
--- a/examples/scripts/vsft_idefics2.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# flake8: noqa
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-# regular:
-python examples/scripts/vsft_idefics2.py \
-    --model_name_or_path=HuggingFaceM4/idefics2-8b \
-    --dataset_name=HuggingFaceH4/cord-v2 \
-    --report_to=wandb \
-    --learning_rate=2.0e-5 \
-    --per_device_train_batch_size=8 \
-    --gradient_accumulation_steps=1 \
-    --output_dir=data/vsft-idefics2 \
-    --logging_steps=5 \
-    --num_train_epochs=1 \
-    --push_to_hub \
-    --gradient_checkpointing \
-    --remove_unused_columns=False \
-    --torch_dtype=bfloat16
-
-
-# peft:
-python examples/scripts/vsft_idefics2.py \
-    --model_name_or_path="HuggingFaceM4/idefics2-tfrm-compatible" \
-    --report_to="wandb" \
-    --learning_rate=1.4e-5 \
-    --per_device_train_batch_size=8 \
-    --gradient_accumulation_steps=1 \
-    --output_dir="data/vsft-llava-1.5-7b-hf" \
-    --logging_steps=5 \
-    --num_train_epochs=1 \
-    --push_to_hub \
-    --gradient_checkpointing \
-    --remove_unused_columns=False \
-    --torch_dtype=float16 \
-    --fp16=True \ 
-    --dataset_name=HuggingFaceH4/llava-instruct-mix-vsft \    
-    --use_peft=True \
-    --lora_r=64 \
-    --lora_alpha=16 \
-    --lora_target_modules=all-linear"
-
-# evaluation:
- 
-To evaluate, first install the lmms-eval framework: pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
-then run:
-accelerate launch --num_processes=8 -m lmms_eval \
-        --model llava_hf \
-        --model_args pretrained=llava-hf/llava-1.5-7b-hf \
-        --tasks mmbench \
-        --batch_size 1 \
-        --output_path ./logs/ \
-        --log_sample    
-"""
-import logging
-import os
-from contextlib import nullcontext
-
-TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
-
-from trl.commands.cli_utils import init_zero_verbose, TrlParser, SFTScriptArguments
-
-if TRL_USE_RICH:
-    init_zero_verbose()
-    FORMAT = "%(message)s"
-
-    from rich.console import Console
-    from rich.logging import RichHandler
-
-import torch
-from accelerate import Accelerator
-from datasets import load_dataset
-
-from tqdm.rich import tqdm
-from transformers import AutoTokenizer, AutoProcessor, Idefics2ForConditionalGeneration
-
-from trl import (
-    ModelConfig,
-    RichProgressCallback,
-    SFTTrainer,
-    SFTConfig,
-    get_peft_config,
-    get_quantization_config,
-    get_kbit_device_map,
-)
-
-tqdm.pandas()
-
-if TRL_USE_RICH:
-    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO)
-
-
-if __name__ == "__main__":
-    parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig))
-    args, training_args, model_config = parser.parse_args_and_config()
-    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=True)
-    # Force use our print callback
-    if TRL_USE_RICH:
-        training_args.disable_tqdm = True
-        console = Console()
-
-    ################
-    # Model, Tokenizer & Processor
-    ################
-    # IDEFICS2_CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
-
-    torch_dtype = (
-        model_config.torch_dtype
-        if model_config.torch_dtype in ["auto", None]
-        else getattr(torch, model_config.torch_dtype)
-    )
-    quantization_config = get_quantization_config(model_config)
-    model_kwargs = dict(
-        revision=model_config.model_revision,
-        trust_remote_code=model_config.trust_remote_code,
-        attn_implementation=model_config.attn_implementation,
-        torch_dtype=torch_dtype,
-        use_cache=False,
-        device_map=get_kbit_device_map() if quantization_config is not None else None,
-        quantization_config=quantization_config,
-    )
-    # tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
-    # tokenizer.chat_template = IDEFICS2_CHAT_TEMPLATE
-    processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
-    # processor.tokenizer = tokenizer
-
-    # model = Idefics2ForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
-
-    ######
-    from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
-    from peft import LoraConfig
-
-    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
-    model = Idefics2ForConditionalGeneration.from_pretrained(
-        "HuggingFaceM4/idefics2-8b", torch_dtype=torch.float16, quantization_config=bnb_config
-    )
-    lora_config = LoraConfig(
-        r=8,
-        lora_alpha=8,
-        lora_dropout=0.1,
-        target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
-        use_dora=False,
-        init_lora_weights="gaussian",
-    )
-    model.add_adapter(lora_config)
-    model.enable_adapters()
-    ######
-
-    ################
-    # Create a data collator to encode text and image pairs
-    ################
-
-    class Idefics2DataCollator:
-        def __init__(self, processor):
-            self.processor = processor
-
-        def __call__(self, examples):
-            texts = []
-            images = []
-            for example in examples:
-                if len(example["images"]) > 1:
-                    raise ValueError("This collator only supports one image per example")
-                messages = example["messages"]
-                text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
-                texts.append(text)
-                images.append([example["images"][0]])
-
-            batch = self.processor(texts, images, return_tensors="pt", padding=True)
-
-            labels = batch["input_ids"].clone()
-            if self.processor.tokenizer.pad_token_id is not None:
-                labels[labels == self.processor.tokenizer.pad_token_id] = -100
-            batch["labels"] = labels
-
-            return batch
-
-    data_collator = Idefics2DataCollator(processor)
-
-    ################
-    # Dataset
-    ################
-    raw_datasets = load_dataset(args.dataset_name)
-    train_dataset = raw_datasets["train"]
-    eval_dataset = raw_datasets["test"]
-
-    ################
-    # Optional rich context managers
-    ###############
-    init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the SFTTrainer...")
-    save_context = (
-        nullcontext()
-        if not TRL_USE_RICH
-        else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
-    )
-
-    ################
-    # Training
-    ################
-    with init_context:
-        trainer = SFTTrainer(
-            model=model,
-            args=training_args,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            dataset_text_field="text",  # need a dummy field
-            tokenizer=processor.tokenizer,
-            # peft_config=get_peft_config(model_config),
-            callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
-            data_collator=data_collator,
-            dataset_kwargs={"skip_prepare_dataset": True},
-        )
-
-    trainer.train()
-
-    with save_context:
-        trainer.save_model(training_args.output_dir)
-        trainer.push_to_hub()
-        if Accelerator().is_main_process:
-            processor.push_to_hub(training_args.hub_model_id)

From 2303c40364feb41f14845ba5885d3b7338b6a0ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 09:19:51 +0000
Subject: [PATCH 18/43] get image

---
 trl/trainer/dpo_trainer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 5af7703d69e..05611ac66ce 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -755,8 +755,7 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
         prompt = feature["prompt"]
         chosen = feature["chosen"]
         rejected = feature["rejected"]
-        if self.is_vision_model:
-            images = feature["images"]
+        images = feature.get("images")
 
         if not self.is_encoder_decoder:
             # Check issues below for more details

From b606190082f9ac8e7fe3973468707d5cd8fae742 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 09:25:11 +0000
Subject: [PATCH 19/43] upcast

---
 trl/trainer/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index c2a25b0fb1f..ef68ee57a60 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -412,7 +412,7 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
 
                     # Set the dtype
                     if k.endswith("_pixel_values"):
-                        dtype = torch.bfloat16  # TODO: tmp fix
+                        dtype = torch.float32  # will be downcasted if necessary by the Trainer
                     else:
                         dtype = torch.int64
 

From 4f78ee57fc67062185f15e36240b568c4d94a49f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 17:07:59 +0000
Subject: [PATCH 20/43] new test

---
 tests/my_new_test.py | 107 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 tests/my_new_test.py

diff --git a/tests/my_new_test.py b/tests/my_new_test.py
new file mode 100644
index 00000000000..24c31267d83
--- /dev/null
+++ b/tests/my_new_test.py
@@ -0,0 +1,107 @@
+import torch
+from datasets import Dataset
+from transformers import AutoModelForVision2Seq, AutoProcessor
+from PIL import Image
+from trl import DPOConfig, DPOTrainer
+import datasets
+
+
+# Get the model
+model_id = "trl-internal-testing/tiny-random-idefics2"
+model = AutoModelForVision2Seq.from_pretrained(model_id)
+ref_model = AutoModelForVision2Seq.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Get the training args
+training_args = DPOConfig(
+    output_dir=".",
+    per_device_train_batch_size=2,
+    max_steps=3,
+    remove_unused_columns=False,
+    gradient_accumulation_steps=1,
+    learning_rate=9e-1,
+    evaluation_strategy="steps",
+    beta=0.1,
+    loss_type="sigmoid",
+    precompute_ref_log_probs=True,
+)
+
+dummy_dataset_dict = {
+    "images": [
+        [Image.new("RGB", (100, 100), color="black")],
+        [Image.new("RGB", (133, 100), color="red")],
+        [Image.new("RGB", (100, 133), color="green")],
+        [Image.new("RGB", (133, 133), color="blue")],
+        [Image.new("RGB", (200, 50), color="yellow")],
+        [Image.new("RGB", (50, 200), color="magenta")],
+        [Image.new("RGB", (200, 200), color="cyan")],
+        # [Image.new("RGB", (50, 50), color="white")],
+        # [Image.new("RGB", (100, 100), color="orange")],
+    ],
+    "prompt": [
+        "<image> hello",
+        "<image> how are you",
+        "<image> What is your name?",
+        "<image> What is your name?",
+        "<image> Which is the best programming language?",
+        "<image> Which is the best programming language?",
+        "<image> Which is the best programming language?",
+        # "[INST] How is the stock price? [/INST]",
+        # "[INST] How is the stock price? [/INST] ",
+    ],
+    "chosen": [
+        "hi nice to meet you",
+        "I am fine",
+        "My name is Mary",
+        "My name is Mary",
+        "Python",
+        "Python",
+        "Python",
+        # "$46 as of 10am EST",
+        # "46 as of 10am EST",
+    ],
+    "rejected": [
+        "leave me alone",
+        "I am not fine",
+        "Whats it to you?",
+        "I dont have a name",
+        "Javascript",
+        "C++",
+        "Java",
+        # " $46 as of 10am EST",
+        # " 46 as of 10am EST",
+    ],
+}
+
+features = datasets.Features(
+    {
+        "images": datasets.Sequence(datasets.Image(decode=True)),  # datasets still handles badly sequence of images
+        "prompt": datasets.Value("string"),
+        "chosen": datasets.Value("string"),
+        "rejected": datasets.Value("string"),
+    }
+)
+dataset = Dataset.from_dict(dummy_dataset_dict, features=features)
+
+
+trainer = DPOTrainer(
+    model=model,
+    ref_model=ref_model,
+    args=training_args,
+    tokenizer=processor,
+    train_dataset=dataset,
+    eval_dataset=dataset,
+)
+
+previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+trainer.train()
+
+assert trainer.state.log_history[-1]["train_loss"] is not None
+
+# check the params have changed
+for n, param in previous_trainable_params.items():
+    new_param = trainer.model.get_parameter(n)
+    # check the params have changed - ignore 0 biases
+    if param.sum() != 0:
+        assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

From c4433c0fdda5485fd0dfcd854eb2bdbfdc228184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 17:09:41 +0000
Subject: [PATCH 21/43] modified test

---
 tests/test_dpo_trainer.py | 113 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index a2aac261f51..f00e2cf2a98 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -18,8 +18,8 @@
 from datasets import Dataset
 from parameterized import parameterized
 from pytest import mark
-from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
-
+from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
+from PIL import Image
 from trl import DPOConfig, DPOTrainer
 
 from .testing_utils import require_bitsandbytes, require_no_wandb, require_peft
@@ -40,6 +40,12 @@ def setUpClass(cls):
         cls.t5_ref_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
         cls.t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
 
+        # get idefics2 model
+        model_id = "trl-internal-testing/tiny-random-idefics2"
+        cls.idefics2_model = AutoModelForVision2Seq.from_pretrained(model_id)
+        cls.idefics2_ref_model = AutoModelForVision2Seq.from_pretrained(model_id)
+        cls.idefics2_processor = AutoProcessor.from_pretrained(model_id)
+
     def _init_dummy_dataset(self):
         # fmt: off
         dummy_dataset_dict = {
@@ -80,6 +86,57 @@ def _init_dummy_dataset(self):
         # fmt: on
         return Dataset.from_dict(dummy_dataset_dict)
 
+    def _init_dummy_image_dataset(self):
+        # fmt: off
+        dummy_dataset_dict = {
+            "images": [
+                [Image.new("RGB", (100, 100), color="black")],
+                [Image.new("RGB", (133, 100), color="red")],
+                [Image.new("RGB", (100, 133), color="green")],
+                [Image.new("RGB", (133, 133), color="blue")],
+                [Image.new("RGB", (200, 50), color="yellow")],
+                [Image.new("RGB", (50, 200), color="magenta")],
+                [Image.new("RGB", (200, 200), color="cyan")],
+                [Image.new("RGB", (50, 50), color="white")],
+                [Image.new("RGB", (100, 100), color="orange")],
+            ],
+            "prompt": [
+                "hello",
+                "how are you",
+                "What is your name?",
+                "What is your name?",
+                "Which is the best programming language?",
+                "Which is the best programming language?",
+                "Which is the best programming language?",
+                "[INST] How is the stock price? [/INST]",
+                "[INST] How is the stock price? [/INST] ",
+            ],
+            "chosen": [
+                "hi nice to meet you",
+                "I am fine",
+                "My name is Mary",
+                "My name is Mary",
+                "Python",
+                "Python",
+                "Python",
+                "$46 as of 10am EST",
+                "46 as of 10am EST",
+            ],
+            "rejected": [
+                "leave me alone",
+                "I am not fine",
+                "Whats it to you?",
+                "I dont have a name",
+                "Javascript",
+                "C++",
+                "Java",
+                " $46 as of 10am EST",
+                " 46 as of 10am EST",
+            ],
+        }
+        # fmt: on
+        return Dataset.from_dict(dummy_dataset_dict)
+
     @parameterized.expand(
         [
             ["gpt2", "sigmoid", True],
@@ -147,6 +204,58 @@ def test_dpo_trainer(self, name, loss_type, pre_compute):
                 if param.sum() != 0:
                     assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)
 
+
+    @parameterized.expand(
+        [
+            ["sigmoid", True],
+        ]
+    )
+    def test_vdpo_trainer(self, loss_type, pre_compute):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            training_args = DPOConfig(
+                output_dir=tmp_dir,
+                per_device_train_batch_size=2,
+                max_steps=3,
+                remove_unused_columns=False,
+                gradient_accumulation_steps=1,
+                learning_rate=9e-1,
+                evaluation_strategy="steps",
+                beta=0.1,
+                loss_type=loss_type,
+                precompute_ref_log_probs=pre_compute,
+            )
+
+            dummy_dataset = self._init_dummy_image_dataset()
+
+            model = self.idefics2_model
+            ref_model = self.idefics2_ref_model
+            processor = self.idefics2_processor
+
+            processor.pad_token_id = processor.tokenizer.pad_token_id
+
+            trainer = DPOTrainer(
+                model=model,
+                ref_model=ref_model,
+                args=training_args,
+                tokenizer=processor,
+                train_dataset=dummy_dataset,
+                eval_dataset=dummy_dataset,
+            )
+
+            previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+            trainer.train()
+
+            assert trainer.state.log_history[-1]["train_loss"] is not None
+
+            # check the params have changed
+            for n, param in previous_trainable_params.items():
+                new_param = trainer.model.get_parameter(n)
+                # check the params have changed - ignore 0 biases
+                if param.sum() != 0:
+                    assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)
+
+
     def test_dpo_trainer_without_providing_ref_model(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             training_args = DPOConfig(

From 7a8a94fafe1adfa4a34637cd293e8ee1c82b487b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 18 Jun 2024 17:12:14 +0000
Subject: [PATCH 22/43] new strat for tokenizer

---
 trl/trainer/dpo_trainer.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 05611ac66ce..ba1c002e63a 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -321,6 +321,12 @@ def make_inputs_require_grad(module, input, output):
                 "No model provided, cannot determine if it is a vision model. Setting is_vision_model to False."
             )
             self.is_vision_model = False
+        
+        if self.is_vision_model:
+            self.processor = tokenizer
+            self.tokenizer = tokenizer.tokenizer # tokenizer is actually a processor at this point
+        else:
+            self.tokenizer = tokenizer
 
         self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
         if model_adapter_name is not None:
@@ -407,7 +413,7 @@ def make_inputs_require_grad(module, input, output):
             args.label_pad_token_id = label_pad_token_id
         if data_collator is None:
             data_collator = DPODataCollatorWithPadding(
-                pad_token_id=tokenizer.pad_token_id,
+                pad_token_id=self.tokenizer.pad_token_id,
                 label_pad_token_id=args.label_pad_token_id,
                 is_encoder_decoder=self.is_encoder_decoder,
             )
@@ -443,7 +449,7 @@ def make_inputs_require_grad(module, input, output):
                 "You passed `padding_value` to the DPOTrainer, the value you passed will override the one in the `DPOConfig`."
             )
             args.padding_value = padding_value
-        self.padding_value = args.padding_value if padding_value is not None else tokenizer.pad_token_id
+        self.padding_value = args.padding_value if padding_value is not None else self.tokenizer.pad_token_id
         self.max_prompt_length = args.max_prompt_length
         if truncation_mode != "keep_end":
             warnings.warn(
@@ -452,7 +458,6 @@ def make_inputs_require_grad(module, input, output):
             args.truncation_mode = truncation_mode
         self.truncation_mode = args.truncation_mode
         self.max_target_length = args.max_target_length
-        self.tokenizer = tokenizer
         self.precompute_ref_log_probs = args.precompute_ref_log_probs
 
         # Since ref_logs are precomputed on the first call to get_train/eval_dataloader
@@ -683,9 +688,9 @@ def build_tokenized_answer(self, prompt, answer, images=None):
         if self.is_vision_model:
             if answer.count("<image>") > 0:
                 raise NotImplementedError("Answer contains <image> token, which is not supported yet.")
-            full_tokenized = self.tokenizer(prompt + answer, images=images, add_special_tokens=False)
+            full_tokenized = self.processor(prompt + answer, images=images, add_special_tokens=False)
             full_tokenized = {k: v[0] for k, v in full_tokenized.items()}  # Unbatch, not done when using idefics
-            prompt_input_ids = self.tokenizer(prompt, images=images, add_special_tokens=False)["input_ids"][0]
+            prompt_input_ids = self.processor(prompt, images=images, add_special_tokens=False)["input_ids"][0]
         else:
             full_tokenized = self.tokenizer(prompt + answer, add_special_tokens=False)
             prompt_input_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
@@ -766,7 +771,7 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
             if not isinstance(prompt, str):
                 raise ValueError(f"prompt should be an str but got {type(prompt)}")
             if self.is_vision_model:
-                prompt_tokens = self.tokenizer(prompt, images=images, add_special_tokens=False)
+                prompt_tokens = self.processor(prompt, images=images, add_special_tokens=False)
                 prompt_tokens = {k: v[0] for k, v in prompt_tokens.items()}  # Unbatch, not done when using idefics
             else:
                 prompt_tokens = self.tokenizer(prompt, add_special_tokens=False)

From 995571058b1eaf1592ef1bc74c0890756979b785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 10:27:11 +0000
Subject: [PATCH 23/43] rm token transfer

---
 examples/scripts/vdpo.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index 7f752ab4dde..2d3fa4d3eba 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -114,11 +114,6 @@
             name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
         ]
 
-    # DPOTrainer needs the processor to have these attributes
-    processor.pad_token_id = tokenizer.pad_token_id
-    processor.bos_token_id = tokenizer.bos_token_id
-    processor.eos_token_id = tokenizer.eos_token_id
-
     ################
     # Optional rich context managers
     ###############
@@ -169,6 +164,6 @@ def process(row):
         )
 
     trainer.train()
-
+    trainer.push_to_hub
     with save_context:
         trainer.save_model(training_args.output_dir)

From f6ee370f1016bb6118cc8bfca8a8982ca6fa98ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 10:52:45 +0000
Subject: [PATCH 24/43] integrate vision in dpo example

---
 examples/scripts/dpo.py | 55 +++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 56a11d9dc3e..1094de950b1 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -48,6 +48,19 @@
     --use_peft \
     --lora_r=16 \
     --lora_alpha=16
+
+# vision with peft:
+accelerate launch examples/scripts/dpo.py \
+    --dataset_name HuggingFaceH4/rlaif-v_formatted \
+    --model_name_or_path HuggingFaceM4/idefics2-8b \
+    --output_dir dpo_idefics_rlaif-v \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --learning_rate 1e-5 \
+    --bf16 \
+    --torch_dtype bfloat16 \
+    --use_peft \
+    --lora_target_modules=all-linear
 """
 
 import logging
@@ -58,6 +71,7 @@
 TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
 
 from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
+from accelerate import PartialState
 
 if TRL_USE_RICH:
     init_zero_verbose()
@@ -68,7 +82,7 @@
 
 import torch
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 
 from trl import (
     DPOConfig,
@@ -112,13 +126,25 @@
         device_map=get_kbit_device_map() if quantization_config is not None else None,
         quantization_config=quantization_config,
     )
-    model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    is_vision_model = model_config.model_name_or_path in ['HuggingFaceM4/idefics2-8b']
+    if is_vision_model:
+        model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     peft_config = get_peft_config(model_config)
     if peft_config is None:
-        model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+        if is_vision_model:
+            model_ref = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+        else:
+            model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     else:
         model_ref = None
-    tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
+    if is_vision_model:
+        processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=True)
+        tokenizer = processor.tokenizer
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
+    processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     if tokenizer.chat_template is None:
@@ -148,16 +174,19 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
-        row["prompt"] = tokenizer.apply_chat_template(row["chosen"][:-1], tokenize=False)
-        row["chosen"] = tokenizer.apply_chat_template([row["chosen"][-1]], tokenize=False)
-        row["rejected"] = tokenizer.apply_chat_template([row["rejected"][-1]], tokenize=False)
+        row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
+        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
+        if "images" in row:
+            for idx, img in enumerate(row["images"]):  # Resize image so that the largest side is 640
+                ratio = min(1.0, 640 / max(img.size))
+                new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+                row["images"][idx] = img.resize(new_size)
+            row["images"] = row["images"]
         return row
 
-    ds = ds.map(
-        process,
-        num_proc=multiprocessing.cpu_count(),
-        load_from_cache_file=False,
-    )
+    with PartialState().local_main_process_first():
+        ds = ds.map(process, num_proc=multiprocessing.cpu_count())
     train_dataset = ds[args.dataset_train_split]
     eval_dataset = ds[args.dataset_test_split]
 
@@ -171,7 +200,7 @@ def process(row):
             args=training_args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
-            tokenizer=tokenizer,
+            tokenizer=processor if is_vision_model else tokenizer,
             peft_config=get_peft_config(model_config),
             callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
         )

From 56fb036e4900505f38264d2630a1b4128e4c8a69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 10:53:35 +0000
Subject: [PATCH 25/43] format

---
 examples/scripts/dpo.py    |  2 +-
 tests/test_dpo_trainer.py  | 15 ++++++++++-----
 trl/trainer/dpo_trainer.py |  4 ++--
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 1094de950b1..b845187e295 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -126,7 +126,7 @@
         device_map=get_kbit_device_map() if quantization_config is not None else None,
         quantization_config=quantization_config,
     )
-    is_vision_model = model_config.model_name_or_path in ['HuggingFaceM4/idefics2-8b']
+    is_vision_model = model_config.model_name_or_path in ["HuggingFaceM4/idefics2-8b"]
     if is_vision_model:
         model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     else:
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index 5c1ded8875e..b151c4fa8b8 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -17,10 +17,17 @@
 import torch
 from datasets import Dataset
 from parameterized import parameterized
-from pytest import mark
-from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
-from trl import DPOConfig, DPOTrainer, FDivergenceType
+from pytest import mark
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+    AutoTokenizer,
+)
+
+from trl import DPOConfig, DPOTrainer
 
 from .testing_utils import require_bitsandbytes, require_no_wandb, require_peft
 
@@ -207,7 +214,6 @@ def test_dpo_trainer(self, name, loss_type, pre_compute):
                 if param.sum() != 0:
                     assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)
 
-
     @parameterized.expand(
         [
             ["sigmoid", True],
@@ -258,7 +264,6 @@ def test_vdpo_trainer(self, loss_type, pre_compute):
                 if param.sum() != 0:
                     assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)
 
-
     def test_dpo_trainer_without_providing_ref_model(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             training_args = DPOConfig(
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 147bb965963..d2fa213db0f 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -323,10 +323,10 @@ def make_inputs_require_grad(module, input, output):
                 "No model provided, cannot determine if it is a vision model. Setting is_vision_model to False."
             )
             self.is_vision_model = False
-        
+
         if self.is_vision_model:
             self.processor = tokenizer
-            self.tokenizer = tokenizer.tokenizer # tokenizer is actually a processor at this point
+            self.tokenizer = tokenizer.tokenizer  # tokenizer is actually a processor at this point
         else:
             self.tokenizer = tokenizer
 

From c3249e52460de9930e9b3b5391786c1e6b24f0af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 12:38:47 +0000
Subject: [PATCH 26/43] add FDivergenceType back

---
 tests/test_dpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index b151c4fa8b8..aefc6b5ba30 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -27,7 +27,7 @@
     AutoTokenizer,
 )
 
-from trl import DPOConfig, DPOTrainer
+from trl import DPOConfig, DPOTrainer, FDivergenceType
 
 from .testing_utils import require_bitsandbytes, require_no_wandb, require_peft
 

From f69bb1c37ebeea825ce20c5d1976de9937eaf185 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 12:43:00 +0000
Subject: [PATCH 27/43] precommit

---
 tests/my_new_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/my_new_test.py b/tests/my_new_test.py
index 24c31267d83..14eef6fe2c2 100644
--- a/tests/my_new_test.py
+++ b/tests/my_new_test.py
@@ -1,9 +1,10 @@
+import datasets
 import torch
 from datasets import Dataset
-from transformers import AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
 from trl import DPOConfig, DPOTrainer
-import datasets
 
 
 # Get the model

From 6d859cf997d87cd4b3ed076bdf919695c04e42a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 12:56:31 +0000
Subject: [PATCH 28/43] pillow test dep

---
 setup.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 65ae9293d57..7180babb93d 100644
--- a/setup.py
+++ b/setup.py
@@ -69,7 +69,16 @@
     "tyro>=0.5.11",
 ]
 EXTRAS = {
-    "test": ["parameterized", "pytest", "pytest-xdist", "accelerate", "pytest-cov", "pytest-xdist", "scikit-learn"],
+    "test": [
+        "parameterized",
+        "pytest",
+        "pytest-xdist",
+        "accelerate",
+        "pytest-cov",
+        "pytest-xdist",
+        "scikit-learn",
+        "Pillow",
+    ],
     "peft": ["peft>=0.4.0"],
     "diffusers": ["diffusers>=0.18.0"],
     "deepspeed": ["deepspeed>=0.9.5"],

From 48db3e17a19a47b9397159600ec6ea644998c16d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 13:04:38 +0000
Subject: [PATCH 29/43] optional prompt

---
 examples/scripts/dpo.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index b845187e295..27ebb8a4df1 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -174,7 +174,8 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
-        row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+        if "prompt" in row:
+            row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
         row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
         if "images" in row:

From dea765b6e012d602dcea252484a9e98ff2704ce9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 13:41:41 +0000
Subject: [PATCH 30/43] `evaluation_strategy` to `eval_strategy`

---
 tests/my_new_test.py      | 2 +-
 tests/test_dpo_trainer.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/my_new_test.py b/tests/my_new_test.py
index 14eef6fe2c2..5a6ae35d354 100644
--- a/tests/my_new_test.py
+++ b/tests/my_new_test.py
@@ -21,7 +21,7 @@
     remove_unused_columns=False,
     gradient_accumulation_steps=1,
     learning_rate=9e-1,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
     beta=0.1,
     loss_type="sigmoid",
     precompute_ref_log_probs=True,
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index aefc6b5ba30..c531d5685ea 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -228,7 +228,7 @@ def test_vdpo_trainer(self, loss_type, pre_compute):
                 remove_unused_columns=False,
                 gradient_accumulation_steps=1,
                 learning_rate=9e-1,
-                evaluation_strategy="steps",
+                eval_strategy="steps",
                 beta=0.1,
                 loss_type=loss_type,
                 precompute_ref_log_probs=pre_compute,
@@ -855,7 +855,7 @@ def test_dpo_loss_alpha_div_f(self):
                 remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
-                evaluation_strategy="steps",
+                eval_strategy="steps",
                 f_divergence_type=FDivergenceType.ALPHA_DIVERGENCE.value,
                 f_alpha_divergence_coef=0.5,
             )
@@ -897,7 +897,7 @@ def test_dpo_loss_js_div_f(self):
                 remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
-                evaluation_strategy="steps",
+                eval_strategy="steps",
                 f_divergence_type=FDivergenceType.JS_DIVERGENCE.value,
                 f_alpha_divergence_coef=0.5,
             )

From d6dc3ba7e91f97f7a5e63c2293bf173e5efd9ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 13:43:58 +0000
Subject: [PATCH 31/43] revert vsft change (oos)

---
 examples/scripts/vsft_llava.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/scripts/vsft_llava.py b/examples/scripts/vsft_llava.py
index 59753927faf..85cb98d5f3c 100644
--- a/examples/scripts/vsft_llava.py
+++ b/examples/scripts/vsft_llava.py
@@ -18,9 +18,9 @@
     --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \
     --model_name_or_path="llava-hf/llava-1.5-7b-hf" \
     --report_to="wandb" \
-    --learning_rate=1.4e-7 \
-    --per_device_train_batch_size=2 \
-    --gradient_accumulation_steps=32 \
+    --learning_rate=1.4e-5 \
+    --per_device_train_batch_size=8 \
+    --gradient_accumulation_steps=1 \
     --output_dir="data/vsft-llava-1.5-7b-hf" \
     --logging_steps=5 \
     --num_train_epochs=1 \
@@ -28,15 +28,16 @@
     --gradient_checkpointing \
     --remove_unused_columns=False \
     --torch_dtype=float16 \
+    --fp16=True
     
 # peft:
 python examples/scripts/vsft_llava.py \
-    --dataset_name="HuggingFaceH4/llava-instruct-mix" \
+    --dataset_name="HuggingFaceH4/llava-instruct-mix-vsft" \    
     --model_name_or_path="llava-hf/llava-1.5-7b-hf" \
     --report_to="wandb" \
     --learning_rate=1.4e-5 \
     --per_device_train_batch_size=8 \
-    --gradient_accumulation_steps=128 \
+    --gradient_accumulation_steps=1 \
     --output_dir="data/vsft-llava-1.5-7b-hf" \
     --logging_steps=5 \
     --num_train_epochs=1 \
@@ -44,10 +45,11 @@
     --gradient_checkpointing \
     --remove_unused_columns=False \
     --torch_dtype=float16 \
+    --fp16=True \ 
     --use_peft=True \
     --lora_r=64 \
     --lora_alpha=16 \
-    --lora_target_modules=all-linear
+    --lora_target_modules=all-linear"
 
 # evaluation:
  

From 3a1f5b8e1fd07e4882958df766be24222346d3c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 14:59:05 +0000
Subject: [PATCH 32/43] update test

---
 tests/test_dpo_trainer.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index c531d5685ea..ff742eb5832 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -15,7 +15,7 @@
 import unittest
 
 import torch
-from datasets import Dataset
+from datasets import Dataset, features
 from parameterized import parameterized
 from PIL import Image
 from pytest import mark
@@ -142,7 +142,15 @@ def _init_dummy_image_dataset(self):
             ],
         }
         # fmt: on
-        return Dataset.from_dict(dummy_dataset_dict)
+        f = features.Features(
+            {
+                "images": features.Sequence(features.Image(decode=True)),  # datasets handles badly sequence of images
+                "prompt": features.Value("string"),
+                "chosen": features.Value("string"),
+                "rejected": features.Value("string"),
+            }
+        )
+        return Dataset.from_dict(dummy_dataset_dict, features=f)
 
     @parameterized.expand(
         [

From 5545825bc9f619f2c5719c167182fd317a0b593a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Tue, 25 Jun 2024 21:53:53 +0000
Subject: [PATCH 33/43] test

---
 tests/my_new_test.py       | 108 -------------------------------------
 tests/test_dpo_trainer.py  |  70 +++++++++++-------------
 trl/trainer/dpo_trainer.py |   3 +-
 3 files changed, 33 insertions(+), 148 deletions(-)
 delete mode 100644 tests/my_new_test.py

diff --git a/tests/my_new_test.py b/tests/my_new_test.py
deleted file mode 100644
index 5a6ae35d354..00000000000
--- a/tests/my_new_test.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import datasets
-import torch
-from datasets import Dataset
-from PIL import Image
-from transformers import AutoModelForVision2Seq, AutoProcessor
-
-from trl import DPOConfig, DPOTrainer
-
-
-# Get the model
-model_id = "trl-internal-testing/tiny-random-idefics2"
-model = AutoModelForVision2Seq.from_pretrained(model_id)
-ref_model = AutoModelForVision2Seq.from_pretrained(model_id)
-processor = AutoProcessor.from_pretrained(model_id)
-
-# Get the training args
-training_args = DPOConfig(
-    output_dir=".",
-    per_device_train_batch_size=2,
-    max_steps=3,
-    remove_unused_columns=False,
-    gradient_accumulation_steps=1,
-    learning_rate=9e-1,
-    eval_strategy="steps",
-    beta=0.1,
-    loss_type="sigmoid",
-    precompute_ref_log_probs=True,
-)
-
-dummy_dataset_dict = {
-    "images": [
-        [Image.new("RGB", (100, 100), color="black")],
-        [Image.new("RGB", (133, 100), color="red")],
-        [Image.new("RGB", (100, 133), color="green")],
-        [Image.new("RGB", (133, 133), color="blue")],
-        [Image.new("RGB", (200, 50), color="yellow")],
-        [Image.new("RGB", (50, 200), color="magenta")],
-        [Image.new("RGB", (200, 200), color="cyan")],
-        # [Image.new("RGB", (50, 50), color="white")],
-        # [Image.new("RGB", (100, 100), color="orange")],
-    ],
-    "prompt": [
-        "<image> hello",
-        "<image> how are you",
-        "<image> What is your name?",
-        "<image> What is your name?",
-        "<image> Which is the best programming language?",
-        "<image> Which is the best programming language?",
-        "<image> Which is the best programming language?",
-        # "[INST] How is the stock price? [/INST]",
-        # "[INST] How is the stock price? [/INST] ",
-    ],
-    "chosen": [
-        "hi nice to meet you",
-        "I am fine",
-        "My name is Mary",
-        "My name is Mary",
-        "Python",
-        "Python",
-        "Python",
-        # "$46 as of 10am EST",
-        # "46 as of 10am EST",
-    ],
-    "rejected": [
-        "leave me alone",
-        "I am not fine",
-        "Whats it to you?",
-        "I dont have a name",
-        "Javascript",
-        "C++",
-        "Java",
-        # " $46 as of 10am EST",
-        # " 46 as of 10am EST",
-    ],
-}
-
-features = datasets.Features(
-    {
-        "images": datasets.Sequence(datasets.Image(decode=True)),  # datasets still handles badly sequence of images
-        "prompt": datasets.Value("string"),
-        "chosen": datasets.Value("string"),
-        "rejected": datasets.Value("string"),
-    }
-)
-dataset = Dataset.from_dict(dummy_dataset_dict, features=features)
-
-
-trainer = DPOTrainer(
-    model=model,
-    ref_model=ref_model,
-    args=training_args,
-    tokenizer=processor,
-    train_dataset=dataset,
-    eval_dataset=dataset,
-)
-
-previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
-
-trainer.train()
-
-assert trainer.state.log_history[-1]["train_loss"] is not None
-
-# check the params have changed
-for n, param in previous_trainable_params.items():
-    new_param = trainer.model.get_parameter(n)
-    # check the params have changed - ignore 0 biases
-    if param.sum() != 0:
-        assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index ff742eb5832..12ede448c0d 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -97,48 +97,40 @@ def _init_dummy_image_dataset(self):
         # fmt: off
         dummy_dataset_dict = {
             "images": [
-                [Image.new("RGB", (100, 100), color="black")],
-                [Image.new("RGB", (133, 100), color="red")],
-                [Image.new("RGB", (100, 133), color="green")],
-                [Image.new("RGB", (133, 133), color="blue")],
-                [Image.new("RGB", (200, 50), color="yellow")],
-                [Image.new("RGB", (50, 200), color="magenta")],
-                [Image.new("RGB", (200, 200), color="cyan")],
-                [Image.new("RGB", (50, 50), color="white")],
-                [Image.new("RGB", (100, 100), color="orange")],
+                [Image.new("RGB", (100, 50), color="black")],
+                # None,
+                # [Image.new("RGB", (100, 100), color="blue"), Image.new("RGB", (150, 50), color="red")],
+                [Image.new("RGB", (200, 100), color="green")],
+                # [Image.new("RGB", (150, 150), color="yellow"), Image.new("RGB", (50, 150), color="purple")],
+                [Image.new("RGB", (80, 120), color="gray")],
+                [Image.new("RGB", (120, 80), color="pink")],
             ],
             "prompt": [
-                "hello",
-                "how are you",
-                "What is your name?",
-                "What is your name?",
-                "Which is the best programming language?",
-                "Which is the best programming language?",
-                "Which is the best programming language?",
-                "[INST] How is the stock price? [/INST]",
-                "[INST] How is the stock price? [/INST] ",
+                "<image> Hello",
+                # "How are you?",
+                # "<image><image> Let's chat",
+                "<image> Good morning",
+                # "<image><image> What's up?",
+                "Can you see this? <image>",
+                "Here is something interesting: <image>",
             ],
             "chosen": [
-                "hi nice to meet you",
-                "I am fine",
-                "My name is Mary",
-                "My name is Mary",
-                "Python",
-                "Python",
-                "Python",
-                "$46 as of 10am EST",
-                "46 as of 10am EST",
+                "Hi nice to meet you!",
+                # "I'm doing well, thank you!",
+                # "Sure, let's talk!",
+                "Good morning to you too!",
+                # "Not much, just working.",
+                "Yes, I can see it clearly.",
+                "That's quite interesting indeed.",
             ],
             "rejected": [
-                "leave me alone",
-                "I am not fine",
-                "Whats it to you?",
-                "I dont have a name",
-                "Javascript",
-                "C++",
-                "Java",
-                " $46 as of 10am EST",
-                " 46 as of 10am EST",
+                "Leave me alone!",
+                # "I'm not interested.",
+                # "I don't want to chat.",
+                "I'm still sleepy.",
+                # "Busy right now, talk later.",
+                "No, I can't see it.",
+                "I'm not sure what that is.",
             ],
         }
         # fmt: on
@@ -248,8 +240,6 @@ def test_vdpo_trainer(self, loss_type, pre_compute):
             ref_model = self.idefics2_ref_model
             processor = self.idefics2_processor
 
-            processor.pad_token_id = processor.tokenizer.pad_token_id
-
             trainer = DPOTrainer(
                 model=model,
                 ref_model=ref_model,
@@ -931,3 +921,7 @@ def test_dpo_loss_js_div_f(self):
                 policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
             )
             assert torch.isfinite(losses).cpu().numpy().all()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index d2fa213db0f..26161773dcf 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -746,7 +746,7 @@ def build_tokenized_answer(self, prompt, answer, images=None):
         answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
         answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
 
-        if self.is_vision_model:
+        if "pixel_values" in full_tokenized:
             return dict(
                 prompt_input_ids=prompt_input_ids,
                 prompt_attention_mask=prompt_attention_mask,
@@ -761,7 +761,6 @@ def build_tokenized_answer(self, prompt, answer, images=None):
                 prompt_attention_mask=prompt_attention_mask,
                 input_ids=answer_input_ids,
                 attention_mask=answer_attention_mask,
-                pixel_value=full_tokenized,
             )
 
     def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> Dict:

From 5197d6debf923bfaf6222f6d2a3e8c9014c4d7df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 08:33:47 +0000
Subject: [PATCH 34/43] comment and support more in process

---
 examples/scripts/dpo.py  | 19 +++++++++++++++++--
 examples/scripts/vdpo.py | 33 +++++++++++++++++++++++++--------
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 27ebb8a4df1..aede4ea609c 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -174,16 +174,31 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
-        if "prompt" in row:
+        # The prompt can be either a string or a list. In some datasets, the prompt is just a common string
+        # for both rejected and chosen (already included in chosen and rejected) and is not meant to be used
+        # separately. In other datasets, the prompt is intended to be used as a prefix for rejected and chosen,
+        # and in such cases, it is properly formatted as a list with keys "role" and "content".
+        # Example 1:
+        # row = {"prompt": "What does detox mean?",
+        #        "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
+        #        "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "user"}]}
+        # Example 2:
+        # row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
+        #        "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
+        #        "rejected": [{"content": "I don't know.", "role": "user"}]}
+        if "prompt" in row and isinstance(row["prompt"], list):
             row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+
         row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
         row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
+
         if "images" in row:
-            for idx, img in enumerate(row["images"]):  # Resize image so that the largest side is 640
+            for idx, img in enumerate(row["images"]):  # Resize each image so the largest side is 640 pixels
                 ratio = min(1.0, 640 / max(img.size))
                 new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
                 row["images"][idx] = img.resize(new_size)
             row["images"] = row["images"]
+
         return row
 
     with PartialState().local_main_process_first():
diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index 2d3fa4d3eba..264fb727375 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -133,14 +133,31 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
-        row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
-        row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
-        row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
-        for idx, img in enumerate(row["images"]):  # Resize image so that the largest side is 640
-            ratio = min(1.0, 640 / max(img.size))
-            new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
-            row["images"][idx] = img.resize(new_size)
-        row["images"] = row["images"]
+        # The prompt can be either a string or a list. In some datasets, the prompt is just a common string
+        # for both rejected and chosen (already included in chosen and rejected) and is not meant to be used
+        # separately. In other datasets, the prompt is intended to be used as a prefix for rejected and chosen,
+        # and in such cases, it is properly formatted as a list with keys "role" and "content".
+        # Example 1:
+        # row = {"prompt": "What does detox mean?",
+        #        "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
+        #        "rejected": [{"content": "What does detox mean?", "role": "user"}, {"content": "I don't know.", "role": "assistant"}]}
+        # Example 2:
+        # row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
+        #        "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
+        #        "rejected": [{"content": "I don't know.", "role": "user"}]}
+        if "prompt" in row and isinstance(row["prompt"], list):
+            row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+
+        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
+        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
+
+        if "images" in row:
+            for idx, img in enumerate(row["images"]):  # Resize each image so the largest side is 640 pixels
+                ratio = min(1.0, 640 / max(img.size))
+                new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+                row["images"][idx] = img.resize(new_size)
+            row["images"] = row["images"]
+
         return row
 
     with PartialState().local_main_process_first():

From 45fda7e4d5eef14a2fd39ec2eafcc6fbe8e4f756 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 10:03:07 +0000
Subject: [PATCH 35/43] update process

---
 examples/scripts/dpo.py  | 15 ++++++++++-----
 examples/scripts/vdpo.py |  6 +++---
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index aede4ea609c..8a91ee7d31d 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -181,16 +181,21 @@ def process(row):
         # Example 1:
         # row = {"prompt": "What does detox mean?",
         #        "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
-        #        "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "user"}]}
+        #        "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "assistant"}]}
         # Example 2:
         # row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
         #        "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
-        #        "rejected": [{"content": "I don't know.", "role": "user"}]}
+        #        "rejected": [{"content": "I don't know.", "role": "assistant"}]}
+        if is_vision_model:
+            apply_chat_template = processor.apply_chat_template
+        else:
+            apply_chat_template = tokenizer.apply_chat_template
+
         if "prompt" in row and isinstance(row["prompt"], list):
-            row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+            row["prompt"] = apply_chat_template(row["prompt"], tokenize=False)
 
-        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
-        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
+        row["chosen"] = apply_chat_template(row["chosen"], tokenize=False)
+        row["rejected"] = apply_chat_template(row["rejected"], tokenize=False)
 
         if "images" in row:
             for idx, img in enumerate(row["images"]):  # Resize each image so the largest side is 640 pixels
diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index 264fb727375..bb3547c5882 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -146,10 +146,10 @@ def process(row):
         #        "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
         #        "rejected": [{"content": "I don't know.", "role": "user"}]}
         if "prompt" in row and isinstance(row["prompt"], list):
-            row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False)
+            row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)
 
-        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
-        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
+        row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
+        row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)
 
         if "images" in row:
             for idx, img in enumerate(row["images"]):  # Resize each image so the largest side is 640 pixels

From 5a1dfa73f72430a9b45615bab7eebe9038e4134d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 10:03:42 +0000
Subject: [PATCH 36/43] update doc for vdpo

---
 docs/source/dpo_trainer.mdx | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 4b95fa3552b..8da953a4bc4 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -70,8 +70,31 @@ dpo_dataset_dict = {
 
 where the `prompt` contains the context inputs, `chosen` contains the corresponding chosen responses and `rejected` contains the corresponding negative (rejected) responses. As can be seen a prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays.
 
+`DPOTrainer` can be used to fine-tune visual language models (VLMs). In this case, the dataset must also contain the key `images`. For example, for Idefics2, the processor expects the dataset to have the following format:
+
+```py
+dpo_dataset_dict = {
+    'images': [
+        [Image.open('beach.jpg')],
+        [Image.open('street.jpg')],
+    ],
+    'prompt': [
+        'The image <image> shows',
+        '<image> The image depicts',
+    ],
+    'chosen': [
+        'a sunny beach with palm trees.',
+        'a busy street with several cars and buildings.',
+    ],
+    'rejected': [
+        'a snowy mountain with skiers.',
+        'a calm countryside with green fields.',
+    ],
+}
+```
+
 ## Expected model format
-The DPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
+The DPO trainer expects a model of `AutoModelForCausalLM` or `AutoModelForVision2Seq`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
 
 ## Using the `DPOTrainer`
 
@@ -86,7 +109,7 @@ dpo_trainer = DPOTrainer(
     model_ref,
     args=training_args,
     train_dataset=train_dataset,
-    tokenizer=tokenizer,
+    tokenizer=tokenizer,  # for visual language models, use tokenizer=processor instead
 )
 ```
 After this one can then call:

From 2c10ca870307d77bd99bbc4a5a0c712eab96eab1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 10:11:26 +0000
Subject: [PATCH 37/43] caution about limited support

---
 docs/source/dpo_trainer.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 8da953a4bc4..614053316f6 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -72,6 +72,8 @@ where the `prompt` contains the context inputs, `chosen` contains the correspond
 
 `DPOTrainer` can be used to fine-tune visual language models (VLMs). In this case, the dataset must also contain the key `images`. For example, for Idefics2, the processor expects the dataset to have the following format:
 
+Note: Currently, VLM support is exclusive to Idefics2 and does not extend to other VLMs.
+
 ```py
 dpo_dataset_dict = {
     'images': [

From 2e476334a6decefe1952a2baa740e65a8b38c759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <45557362+qgallouedec@users.noreply.github.com>
Date: Wed, 26 Jun 2024 15:04:22 +0200
Subject: [PATCH 38/43] Update docs/source/dpo_trainer.mdx

Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
---
 docs/source/dpo_trainer.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 614053316f6..4173eed5a65 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -70,7 +70,7 @@ dpo_dataset_dict = {
 
 where the `prompt` contains the context inputs, `chosen` contains the corresponding chosen responses and `rejected` contains the corresponding negative (rejected) responses. As can be seen a prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays.
 
-`DPOTrainer` can be used to fine-tune visual language models (VLMs). In this case, the dataset must also contain the key `images`. For example, for Idefics2, the processor expects the dataset to have the following format:
+`DPOTrainer` can be used to fine-tune visual language models (VLMs). In this case, the dataset must also contain the key `images`, and the trainer's `tokenizer` is the VLM's `processor`. For example, for Idefics2, the processor expects the dataset to have the following format:
 
 Note: Currently, VLM support is exclusive to Idefics2 and does not extend to other VLMs.
 

From f960a2a4603edde36c21f6133a01f347c52c2c6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 13:32:52 +0000
Subject: [PATCH 39/43] revert DPO example changes

---
 examples/scripts/dpo.py | 76 +++++++----------------------------------
 1 file changed, 13 insertions(+), 63 deletions(-)

diff --git a/examples/scripts/dpo.py b/examples/scripts/dpo.py
index 8a91ee7d31d..56a11d9dc3e 100644
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@@ -48,19 +48,6 @@
     --use_peft \
     --lora_r=16 \
     --lora_alpha=16
-
-# vision with peft:
-accelerate launch examples/scripts/dpo.py \
-    --dataset_name HuggingFaceH4/rlaif-v_formatted \
-    --model_name_or_path HuggingFaceM4/idefics2-8b \
-    --output_dir dpo_idefics_rlaif-v \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 16 \
-    --learning_rate 1e-5 \
-    --bf16 \
-    --torch_dtype bfloat16 \
-    --use_peft \
-    --lora_target_modules=all-linear
 """
 
 import logging
@@ -71,7 +58,6 @@
 TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)
 
 from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
-from accelerate import PartialState
 
 if TRL_USE_RICH:
     init_zero_verbose()
@@ -82,7 +68,7 @@
 
 import torch
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from trl import (
     DPOConfig,
@@ -126,25 +112,13 @@
         device_map=get_kbit_device_map() if quantization_config is not None else None,
         quantization_config=quantization_config,
     )
-    is_vision_model = model_config.model_name_or_path in ["HuggingFaceM4/idefics2-8b"]
-    if is_vision_model:
-        model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     peft_config = get_peft_config(model_config)
     if peft_config is None:
-        if is_vision_model:
-            model_ref = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
-        else:
-            model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+        model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
     else:
         model_ref = None
-    if is_vision_model:
-        processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=True)
-        tokenizer = processor.tokenizer
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
-    processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
+    tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     if tokenizer.chat_template is None:
@@ -174,40 +148,16 @@
             ds[key] = ds[key].select(range(50))
 
     def process(row):
-        # The prompt can be either a string or a list. In some datasets, the prompt is just a common string
-        # for both rejected and chosen (already included in chosen and rejected) and is not meant to be used
-        # separately. In other datasets, the prompt is intended to be used as a prefix for rejected and chosen,
-        # and in such cases, it is properly formatted as a list with keys "role" and "content".
-        # Example 1:
-        # row = {"prompt": "What does detox mean?",
-        #        "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
-        #        "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "assistant"}]}
-        # Example 2:
-        # row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
-        #        "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
-        #        "rejected": [{"content": "I don't know.", "role": "assistant"}]}
-        if is_vision_model:
-            apply_chat_template = processor.apply_chat_template
-        else:
-            apply_chat_template = tokenizer.apply_chat_template
-
-        if "prompt" in row and isinstance(row["prompt"], list):
-            row["prompt"] = apply_chat_template(row["prompt"], tokenize=False)
-
-        row["chosen"] = apply_chat_template(row["chosen"], tokenize=False)
-        row["rejected"] = apply_chat_template(row["rejected"], tokenize=False)
-
-        if "images" in row:
-            for idx, img in enumerate(row["images"]):  # Resize each image so the largest side is 640 pixels
-                ratio = min(1.0, 640 / max(img.size))
-                new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
-                row["images"][idx] = img.resize(new_size)
-            row["images"] = row["images"]
-
+        row["prompt"] = tokenizer.apply_chat_template(row["chosen"][:-1], tokenize=False)
+        row["chosen"] = tokenizer.apply_chat_template([row["chosen"][-1]], tokenize=False)
+        row["rejected"] = tokenizer.apply_chat_template([row["rejected"][-1]], tokenize=False)
         return row
 
-    with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=multiprocessing.cpu_count())
+    ds = ds.map(
+        process,
+        num_proc=multiprocessing.cpu_count(),
+        load_from_cache_file=False,
+    )
     train_dataset = ds[args.dataset_train_split]
     eval_dataset = ds[args.dataset_test_split]
 
@@ -221,7 +171,7 @@ def process(row):
             args=training_args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
-            tokenizer=processor if is_vision_model else tokenizer,
+            tokenizer=tokenizer,
             peft_config=get_peft_config(model_config),
             callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
         )

From e4c743616a3eb188a886ffe387eea29a4d56bee8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 13:33:29 +0000
Subject: [PATCH 40/43] cleaner way to check if a model is vision

---
 trl/trainer/dpo_trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 26161773dcf..c579e77743e 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -37,6 +37,7 @@
     PreTrainedTokenizerBase,
     Trainer,
 )
+from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalLoopOutput
 
@@ -317,7 +318,7 @@ def make_inputs_require_grad(module, input, output):
             self.is_encoder_decoder = args.is_encoder_decoder
 
         if model is not None:
-            self.is_vision_model = model.config.model_type in ["idefics2"]  # TODO: find a better way
+            self.is_vision_model = model.config.model_type in MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.keys()
         else:
             warnings.warn(
                 "No model provided, cannot determine if it is a vision model. Setting is_vision_model to False."

From bfb35d347935cbb4a1801479d28fb9de6b39e656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 14:03:49 +0000
Subject: [PATCH 41/43] comment

---
 trl/trainer/dpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index c579e77743e..b4775b165e8 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -510,7 +510,7 @@ def make_inputs_require_grad(module, input, output):
         # Compute that only on the main process for faster data processing.
         # see: https://github.com/huggingface/trl/pull/1255
         with PartialState().local_main_process_first():
-            # tokenize the dataset
+            # tokenize the dataset; a small writer_batch_size limits memory use and avoids OOM (frequent with vision models)
             train_dataset = train_dataset.map(self.tokenize_row, num_proc=self.dataset_num_proc, writer_batch_size=10)
             if eval_dataset is not None:
                 eval_dataset = eval_dataset.map(

From 7b22153f2175b2a8b7ab515a47bf74bb02694d58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 14:06:13 +0000
Subject: [PATCH 42/43] update vdpo example

---
 examples/scripts/vdpo.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/vdpo.py
index bb3547c5882..b602c8f0098 100644
--- a/examples/scripts/vdpo.py
+++ b/examples/scripts/vdpo.py
@@ -18,22 +18,15 @@
     --model_name_or_path HuggingFaceM4/idefics2-8b \
     --per_device_train_batch_size 1 \
     --gradient_accumulation_steps 16 \
-    --learning_rate 1e-5 \
-    --logging_steps 5 \
+    --dataset_num_proc 32 \
     --output_dir dpo_idefics_rlaif-v \
-    --push_to_hub --hub_model_id HuggingFaceH4/idefics2-8b-dpo-rlaif-v \
     --bf16 \
     --torch_dtype bfloat16 \
-    --logging_first_step \
-    --no_remove_unused_columns \
-    --dataset_num_proc 50 \
-    --dataload_num_workers 16 \
     --use_peft \
     --lora_target_modules=all-linear
 """
 
 import logging
-import multiprocessing
 import os
 from contextlib import nullcontext
 
@@ -106,8 +99,6 @@
     tokenizer = processor.tokenizer
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
     if args.ignore_bias_buffers:
         # torch distributed hack
         model._ddp_params_and_buffers_to_ignore = [
@@ -161,7 +152,7 @@ def process(row):
         return row
 
     with PartialState().local_main_process_first():
-        ds = ds.map(process, num_proc=multiprocessing.cpu_count())
+        ds = ds.map(process, num_proc=training_args.dataset_num_proc)
     train_dataset = ds[args.dataset_train_split]
     eval_dataset = ds[args.dataset_test_split]
 

From 515519491e150ae3f6b030d3f269b18789c0b26f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <quentin.gallouedec@huggingface.co>
Date: Wed, 26 Jun 2024 14:06:34 +0000
Subject: [PATCH 43/43] rename

---
 examples/scripts/{vdpo.py => dpo_visual.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/scripts/{vdpo.py => dpo_visual.py} (100%)

diff --git a/examples/scripts/vdpo.py b/examples/scripts/dpo_visual.py
similarity index 100%
rename from examples/scripts/vdpo.py
rename to examples/scripts/dpo_visual.py