From 5c0b240f5759110a850490a63aa3b2e7fb0cf708 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Mon, 21 Oct 2024 14:20:56 +0000 Subject: [PATCH 1/4] wip vlm2vec model --- mteb/models/vlm2vec_models.py | 244 ++++++++++++++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 mteb/models/vlm2vec_models.py diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py new file mode 100644 index 0000000000..6146a70100 --- /dev/null +++ b/mteb/models/vlm2vec_models.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import logging +from functools import partial +from typing import Any, Literal + +import numpy as np +import torch +from peft import LoraConfig, PeftModel +from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor + +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from mteb.model_meta import ModelMeta +from mteb.models.text_formatting_utils import corpus_to_texts + +from .instructions import task_to_instruction + +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + +EncodeTypes = Literal["query", "passage"] + + +def llm2vec_instruction(instruction): + if len(instruction) > 0 and instruction[-1] != ":": + instruction = instruction.strip(".") + ":" + return instruction + + +class VLM2VecWrapper: + """Adapted from https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/model.py""" + def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-LoRA", device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs): + try: + import flash_attn # noqa + except ImportError: + logger.warning( + "VLM2Vec models were trained with flash attention enabled. For optimal performance, please install the `flash_attn` package with `pip install flash-attn --no-build-isolation`." 
+ ) + + self.pooling = "last" + self.normalize = True + self.temperature = 1.0 + self.hidden_size = 4096 + self.device = device + + # Loading the base model + base_model_name = "microsoft/Phi-3.5-vision-instruct" + config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True) + config.use_cache = False + config.padding_side = "right" + + checkpoint_path = model_name if model_name else base_model_name + base_model = AutoModelForCausalLM.from_pretrained( + checkpoint_path, + config=config, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ) + base_model.padding_side = "right" + + # Building the model on top of the base + if "LoRA" in model_name: + lora_config = LoraConfig.from_pretrained(checkpoint_path) + lora_model = PeftModel.from_pretrained( + base_model, checkpoint_path, config=lora_config + ) + lora_model = lora_model.merge_and_unload() + model = lora_model + else: + model = base_model + + model.eval() + self.mdl = model + + self.processor = AutoProcessor.from_pretrained( + base_model_name, + trust_remote_code=True, + num_crops=4, + ) + + def to(self, device: torch.device) -> None: + self.mdl.to(device, dtype=torch.bfloat16) + + def encode( + self, + sentences: list[str], + *, + prompt_name: str = None, + **kwargs: Any, # noqa + ) -> np.ndarray: + if prompt_name is not None: + instruction = ( + self.task_to_instructions[prompt_name] + if self.task_to_instructions + and prompt_name in self.task_to_instructions + else llm2vec_instruction(task_to_instruction(prompt_name)) + ) + else: + instruction = "" + + sentences = [[instruction, sentence] for sentence in sentences] + return self.model.encode(sentences, **kwargs) + + def encode_corpus( + self, + corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], + prompt_name: str = None, + **kwargs: Any, + ) -> np.ndarray: + sentences = corpus_to_texts(corpus, sep=" ") + sentences = [["", sentence] for sentence in sentences] + if "request_qid" in 
kwargs: + kwargs.pop("request_qid") + return self.model.encode(sentences, **kwargs) + + def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: + return self.encode(queries, **kwargs) + + def encode_input(self, input): + hidden_states = self.encoder(**input, return_dict=True, output_hidden_states=True) + hidden_states = hidden_states.hidden_states[-1] + pooled_output = self._pooling(hidden_states, input['attention_mask']) + return pooled_output + + def _pooling(self, last_hidden_state, attention_mask): + if self.pooling == 'last': + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + reps = last_hidden_state[ + torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] + else: + raise NotImplementedError + if self.normalize: + reps = torch.nn.functional.normalize(reps, p=2, dim=-1) + return reps + + + def get_image_embeddings( + self, images: list[Image.Image] | DataLoader, batch_size: int = 32 + ): + all_image_embeddings = [] + if isinstance(images, DataLoader): + with torch.no_grad(): + for batch in tqdm(images): + inputs = self.processor( + images=batch, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + all_image_embeddings.append(image_outputs.cpu()) + else: + with torch.no_grad(): + for i in tqdm(range(0, len(images), batch_size)): + batch_images = images[i : i + batch_size] + inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + all_image_embeddings.append(image_outputs.cpu()) + + all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + return all_image_embeddings + + + def get_text_embeddings(self, texts: list[str], batch_size: int = 32): + all_text_embeddings = [] + + with torch.no_grad(): + for i in tqdm(range(0, 
len(texts), batch_size)): + batch_texts = texts[i : i + batch_size] + inputs = self.processor( + text=batch_texts, return_tensors="pt", padding=True, truncation=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + text_outputs = self.encode_input(**inputs) + all_text_embeddings.append(text_outputs.cpu()) + + all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + return all_text_embeddings + + + def get_fused_embeddings( + self, + texts: list[str] = None, + images: list[Image.Image] | DataLoader = None, + fusion_mode="sum", + batch_size: int = 32, + ): + if texts is None and images is None: + raise ValueError("Either texts or images must be provided") + + text_embeddings = None + image_embeddings = None + + if texts is not None: + text_embeddings = self.get_text_embeddings(texts, batch_size) + + if images is not None: + image_embeddings = self.get_image_embeddings(images, batch_size) + + if text_embeddings is not None and image_embeddings is not None: + if len(text_embeddings) != len(image_embeddings): + raise ValueError( + "The number of texts and images must have the same length" + ) + if fusion_mode == "sum": + fused_embeddings = text_embeddings + image_embeddings + else: + # to do: add other fusion mode + raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + return fused_embeddings + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + + +vlm2vec_lora = ModelMeta( + loader=partial( + VLM2VecWrapper, + model_name="TIGER-Lab/VLM2Vec-LoRA", + ), + name="TIGER-Lab/VLM2Vec-LoRA", + languages=["eng_Latn"], + open_source=True, + revision="7403b6327958071c1e33c822c7453adadccc7298", + release_date="2024-10-08", +) + +vlm2vec_full = ModelMeta( + loader=partial( + VLM2VecWrapper, + model_name="TIGER-Lab/VLM2Vec-Full", + ), + name="TIGER-Lab/VLM2Vec-Full", + languages=["eng_Latn"], + open_source=True, + revision="e9afa98002097ac2471827ba23ea1f2ddd229480", + 
release_date="2024-10-08", +) From 987af29e4d70f3fb162b825a15dcb5f32f90e81e Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 24 Oct 2024 14:31:13 +0000 Subject: [PATCH 2/4] making i2t classification work with Caltech101 --- .../Image/ClassificationEvaluator.py | 4 + mteb/models/__init__.py | 3 +- mteb/models/vlm2vec_models.py | 87 ++++++++++--------- .../Caltech101.json | 28 ++++++ .../model_meta.json | 1 + 5 files changed, 83 insertions(+), 40 deletions(-) create mode 100644 results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/Caltech101.json create mode 100644 results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/model_meta.json diff --git a/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py index eaa6416dc7..e129ab0a4c 100644 --- a/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/Image/ClassificationEvaluator.py @@ -381,6 +381,10 @@ def __call__(self, model, test_cache=None): else: X_test = test_cache logger.info("Fitting logistic regression classifier...") + if X_train.dtype == torch.bfloat16: + X_train = X_train.to(torch.float32) + if X_test.dtype == torch.bfloat16: + X_test = X_test.to(torch.float32) clf.fit(X_train, self.y_train) logger.info("Evaluating...") y_pred = clf.predict(X_test) diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index d8512a5a23..25c7eb8f44 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -34,6 +34,7 @@ salesforce_models, sentence_transformers_models, vista_models, + vlm2vec_models, voyage_models, ) @@ -160,7 +161,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe sentence_transformers_models, vista_models, voyage_models, - google_models, + vlm2vec_models, ] models = {} diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index 6146a70100..e336769313 --- 
a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -73,6 +73,7 @@ def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-LoRA", device: str = "cu model = base_model model.eval() + model.to(device) self.mdl = model self.processor = AutoProcessor.from_pretrained( @@ -84,43 +85,29 @@ def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-LoRA", device: str = "cu def to(self, device: torch.device) -> None: self.mdl.to(device, dtype=torch.bfloat16) - def encode( - self, - sentences: list[str], - *, - prompt_name: str = None, - **kwargs: Any, # noqa - ) -> np.ndarray: - if prompt_name is not None: - instruction = ( - self.task_to_instructions[prompt_name] - if self.task_to_instructions - and prompt_name in self.task_to_instructions - else llm2vec_instruction(task_to_instruction(prompt_name)) - ) - else: - instruction = "" + # def encode( + # self, + # sentences: list[str], + # *, + # prompt_name: str = None, + # **kwargs: Any, # noqa + # ) -> np.ndarray: + # if prompt_name is not None: + # instruction = ( + # self.task_to_instructions[prompt_name] + # if self.task_to_instructions + # and prompt_name in self.task_to_instructions + # else llm2vec_instruction(task_to_instruction(prompt_name)) + # ) + # else: + # instruction = "" + + # sentences = [[instruction, sentence] for sentence in sentences] + # return self.model.encode(sentences, **kwargs) - sentences = [[instruction, sentence] for sentence in sentences] - return self.model.encode(sentences, **kwargs) - - def encode_corpus( - self, - corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], - prompt_name: str = None, - **kwargs: Any, - ) -> np.ndarray: - sentences = corpus_to_texts(corpus, sep=" ") - sentences = [["", sentence] for sentence in sentences] - if "request_qid" in kwargs: - kwargs.pop("request_qid") - return self.model.encode(sentences, **kwargs) - - def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: - return self.encode(queries, **kwargs) def 
encode_input(self, input): - hidden_states = self.encoder(**input, return_dict=True, output_hidden_states=True) + hidden_states = self.mdl(**input, return_dict=True, output_hidden_states=True) hidden_states = hidden_states.hidden_states[-1] pooled_output = self._pooling(hidden_states, input['attention_mask']) return pooled_output @@ -138,19 +125,41 @@ def _pooling(self, last_hidden_state, attention_mask): return reps + # reference: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/collator.py def get_image_embeddings( self, images: list[Image.Image] | DataLoader, batch_size: int = 32 ): + text="<|image_1|> Represent the given image." all_image_embeddings = [] if isinstance(images, DataLoader): + import torchvision.transforms.functional as F with torch.no_grad(): for batch in tqdm(images): - inputs = self.processor( - images=batch, return_tensors="pt", padding=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.get_image_features(**inputs) + input_ids, pixel_values, image_sizes = [], [], [] + for b in batch: + inputs = self.processor(text, [F.to_pil_image(b.to("cpu"))], return_tensors="pt", max_length=256, truncation=True) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + input_ids.append(inputs["input_ids"].squeeze(0).unsqueeze(1)) + pixel_values.append(inputs['pixel_values']) + image_sizes.append(inputs['image_sizes']) + + input_ids = torch._C._nn.pad_sequence( + input_ids, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id + ).squeeze(2) + attention_mask = input_ids.ne(self.processor.tokenizer.pad_token_id) + + pixel_values = torch.cat(pixel_values, dim=0) + image_sizes = torch.cat(image_sizes, dim=0) + inputs = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'pixel_values': pixel_values, + 'image_sizes': image_sizes, + } + + image_outputs = self.encode_input(inputs) all_image_embeddings.append(image_outputs.cpu()) + else: with torch.no_grad(): for i in 
tqdm(range(0, len(images), batch_size)): diff --git a/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/Caltech101.json b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/Caltech101.json new file mode 100644 index 0000000000..cac75612d1 --- /dev/null +++ b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/Caltech101.json @@ -0,0 +1,28 @@ +{ + "dataset_revision": "851374102055782c84f89b1b4e9d128a6568847b", + "evaluation_time": 1317.9743084907532, + "kg_co2_emissions": null, + "mteb_version": "1.14.21", + "scores": { + "test": [ + { + "accuracy": 0.9301446416831032, + "f1": 0.8863632422649081, + "f1_weighted": 0.9270094006117223, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.9301446416831032, + "scores_per_experiment": [ + { + "accuracy": 0.9301446416831032, + "f1": 0.8863632422649081, + "f1_weighted": 0.9270094006117223 + } + ] + } + ] + }, + "task_name": "Caltech101" +} \ No newline at end of file diff --git a/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/model_meta.json b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/model_meta.json new file mode 100644 index 0000000000..07f5788002 --- /dev/null +++ b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/model_meta.json @@ -0,0 +1 @@ +{"name": "TIGER-Lab/VLM2Vec-LoRA", "revision": "7403b6327958071c1e33c822c7453adadccc7298", "release_date": "2024-10-08", "languages": ["eng_Latn"], "n_parameters": null, "memory_usage": null, "max_tokens": null, "embed_dim": null, "license": null, "open_source": true, "similarity_fn_name": null, "framework": [], "loader": "VLM2VecWrapper"} \ No newline at end of file From c1057974250743e2992f3475c406ec08d1b5246b Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 24 Oct 2024 20:50:53 +0000 Subject: [PATCH 3/4] test vlm2vec on other task types --- mteb/models/vlm2vec_models.py | 
183 +++++++++++------- .../STS12.json | 26 +++ 2 files changed, 142 insertions(+), 67 deletions(-) create mode 100644 results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/STS12.json diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index e336769313..1adc006ce1 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -4,18 +4,14 @@ from functools import partial from typing import Any, Literal -import numpy as np import torch from peft import LoraConfig, PeftModel -from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor - from PIL import Image from torch.utils.data import DataLoader from tqdm import tqdm -from mteb.model_meta import ModelMeta -from mteb.models.text_formatting_utils import corpus_to_texts +from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor -from .instructions import task_to_instruction +from mteb.model_meta import ModelMeta logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) @@ -23,15 +19,15 @@ EncodeTypes = Literal["query", "passage"] -def llm2vec_instruction(instruction): - if len(instruction) > 0 and instruction[-1] != ":": - instruction = instruction.strip(".") + ":" - return instruction - - class VLM2VecWrapper: """Adapted from https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/model.py""" - def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-LoRA", device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs): + + def __init__( + self, + model_name: str = "TIGER-Lab/VLM2Vec-LoRA", + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs, + ): try: import flash_attn # noqa except ImportError: @@ -82,79 +78,74 @@ def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-LoRA", device: str = "cu num_crops=4, ) - def to(self, device: torch.device) -> None: - self.mdl.to(device, dtype=torch.bfloat16) - - # def encode( - # self, - # sentences: list[str], - # *, - # prompt_name: 
str = None, - # **kwargs: Any, # noqa - # ) -> np.ndarray: - # if prompt_name is not None: - # instruction = ( - # self.task_to_instructions[prompt_name] - # if self.task_to_instructions - # and prompt_name in self.task_to_instructions - # else llm2vec_instruction(task_to_instruction(prompt_name)) - # ) - # else: - # instruction = "" - - # sentences = [[instruction, sentence] for sentence in sentences] - # return self.model.encode(sentences, **kwargs) - + def encode( + self, + sentences: list[str], + *, + prompt_name: str = None, + **kwargs: Any, # noqa + ): + return self.get_text_embeddings(texts=sentences) def encode_input(self, input): hidden_states = self.mdl(**input, return_dict=True, output_hidden_states=True) hidden_states = hidden_states.hidden_states[-1] - pooled_output = self._pooling(hidden_states, input['attention_mask']) + pooled_output = self._pooling(hidden_states, input["attention_mask"]) return pooled_output - + def _pooling(self, last_hidden_state, attention_mask): - if self.pooling == 'last': + if self.pooling == "last": sequence_lengths = attention_mask.sum(dim=1) - 1 batch_size = last_hidden_state.shape[0] reps = last_hidden_state[ - torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] + torch.arange(batch_size, device=last_hidden_state.device), + sequence_lengths, + ] else: raise NotImplementedError if self.normalize: reps = torch.nn.functional.normalize(reps, p=2, dim=-1) return reps - # reference: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/src/collator.py def get_image_embeddings( self, images: list[Image.Image] | DataLoader, batch_size: int = 32 ): - text="<|image_1|> Represent the given image." + text = "<|image_1|> Represent the given image." 
all_image_embeddings = [] if isinstance(images, DataLoader): import torchvision.transforms.functional as F + with torch.no_grad(): for batch in tqdm(images): input_ids, pixel_values, image_sizes = [], [], [] for b in batch: - inputs = self.processor(text, [F.to_pil_image(b.to("cpu"))], return_tensors="pt", max_length=256, truncation=True) + inputs = self.processor( + text, + [F.to_pil_image(b.to("cpu"))], + return_tensors="pt", + max_length=256, + truncation=True, + ) inputs = {k: v.to(self.device) for k, v in inputs.items()} input_ids.append(inputs["input_ids"].squeeze(0).unsqueeze(1)) - pixel_values.append(inputs['pixel_values']) - image_sizes.append(inputs['image_sizes']) + pixel_values.append(inputs["pixel_values"]) + image_sizes.append(inputs["image_sizes"]) input_ids = torch._C._nn.pad_sequence( - input_ids, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id + input_ids, + batch_first=True, + padding_value=self.processor.tokenizer.pad_token_id, ).squeeze(2) attention_mask = input_ids.ne(self.processor.tokenizer.pad_token_id) pixel_values = torch.cat(pixel_values, dim=0) image_sizes = torch.cat(image_sizes, dim=0) inputs = { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'pixel_values': pixel_values, - 'image_sizes': image_sizes, + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_sizes": image_sizes, } image_outputs = self.encode_input(inputs) @@ -164,34 +155,77 @@ def get_image_embeddings( with torch.no_grad(): for i in tqdm(range(0, len(images), batch_size)): batch_images = images[i : i + batch_size] - inputs = self.processor( - images=batch_images, return_tensors="pt", padding=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.get_image_features(**inputs) + input_ids, pixel_values, image_sizes = [], [], [] + for b in batch_images: + inputs = self.processor( + text, + [b], + return_tensors="pt", + max_length=256, + 
truncation=True, + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + input_ids.append(inputs["input_ids"].squeeze(0).unsqueeze(1)) + pixel_values.append(inputs["pixel_values"]) + image_sizes.append(inputs["image_sizes"]) + + input_ids = torch._C._nn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.processor.tokenizer.pad_token_id, + ).squeeze(2) + attention_mask = input_ids.ne(self.processor.tokenizer.pad_token_id) + + pixel_values = torch.cat(pixel_values, dim=0) + image_sizes = torch.cat(image_sizes, dim=0) + inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_sizes": image_sizes, + } + + image_outputs = self.encode_input(inputs) all_image_embeddings.append(image_outputs.cpu()) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) return all_image_embeddings - def get_text_embeddings(self, texts: list[str], batch_size: int = 32): all_text_embeddings = [] with torch.no_grad(): for i in tqdm(range(0, len(texts), batch_size)): + input_ids = [] batch_texts = texts[i : i + batch_size] - inputs = self.processor( - text=batch_texts, return_tensors="pt", padding=True, truncation=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - text_outputs = self.encode_input(**inputs) + for text in batch_texts: + inputs = self.processor( + text, + None, + return_tensors="pt", + max_length=256, + truncation=True, + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + input_ids.append(inputs["input_ids"].squeeze(0).unsqueeze(1)) + + input_ids = torch._C._nn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.processor.tokenizer.pad_token_id, + ).squeeze(2) + attention_mask = input_ids.ne(self.processor.tokenizer.pad_token_id) + inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + + text_outputs = self.encode_input(inputs) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = 
torch.cat(all_text_embeddings, dim=0) return all_text_embeddings - def get_fused_embeddings( self, texts: list[str] = None, @@ -216,11 +250,26 @@ def get_fused_embeddings( raise ValueError( "The number of texts and images must have the same length" ) - if fusion_mode == "sum": - fused_embeddings = text_embeddings + image_embeddings - else: - # to do: add other fusion mode - raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + texts = iter(texts) + all_fused_embeddings = [] + if isinstance(images, DataLoader): + import torchvision.transforms.functional as F + + for batch in images: + for b in batch: + text = next(texts) + inputs = self.processor( + f"<|image_1|> Represent the given image with the following question: {text}", + [F.to_pil_image(b.to("cpu"))], + ) + inputs = { + key: value.to(self.device) for key, value in inputs.items() + } + outputs = self.encode_input(inputs) + all_fused_embeddings.append(outputs.cpu()) + + fused_embeddings = torch.cat(all_fused_embeddings, dim=0) + return fused_embeddings elif text_embeddings is not None: return text_embeddings diff --git a/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/STS12.json b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/STS12.json new file mode 100644 index 0000000000..2f7a702114 --- /dev/null +++ b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/STS12.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "a0d554a64d88156834ff5ae9920b964011b16384", + "evaluation_time": 33.679136514663696, + "kg_co2_emissions": null, + "mteb_version": "1.14.21", + "scores": { + "test": [ + { + "cosine_pearson": 0.6128856131150828, + "cosine_spearman": 0.5375376750091784, + "euclidean_pearson": 0.5866571133163221, + "euclidean_spearman": 0.5376001641683719, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5375376750091784, + "manhattan_pearson": 0.5912422177023093, + "manhattan_spearman": 
0.5413588869937086, + "pearson": 0.6128856131150828, + "spearman": 0.5375376750091784 + } + ] + }, + "task_name": "STS12" +} \ No newline at end of file From d89a0d19fa9c494c436a3ed09e795342180b7dfe Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 24 Oct 2024 21:01:54 +0000 Subject: [PATCH 4/4] move peft into class --- mteb/models/vlm2vec_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index 1adc006ce1..321cba24a4 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -5,7 +5,6 @@ from typing import Any, Literal import torch -from peft import LoraConfig, PeftModel from PIL import Image from torch.utils.data import DataLoader from tqdm import tqdm @@ -30,6 +29,7 @@ def __init__( ): try: import flash_attn # noqa + from peft import LoraConfig, PeftModel # noqa except ImportError: logger.warning( "VLM2Vec models were trained with flash attention enabled. For optimal performance, please install the `flash_attn` package with `pip install flash-attn --no-build-isolation`."