diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index d75236a93a..e9c8653f74 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -149,7 +149,7 @@ def get_image_embeddings( } image_outputs = self.encode_input(inputs) - all_image_embeddings.append(image_outputs.cpu()) + all_image_embeddings.append(image_outputs.cpu().to(torch.float32)) else: with torch.no_grad(): @@ -186,7 +186,7 @@ def get_image_embeddings( } image_outputs = self.encode_input(inputs) - all_image_embeddings.append(image_outputs.cpu()) + all_image_embeddings.append(image_outputs.cpu().to(torch.float32)) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) return all_image_embeddings @@ -221,7 +221,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): } text_outputs = self.encode_input(inputs) - all_text_embeddings.append(text_outputs.cpu()) + all_text_embeddings.append(text_outputs.cpu().to(torch.float32)) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) return all_text_embeddings @@ -239,39 +239,34 @@ def get_fused_embeddings( text_embeddings = None image_embeddings = None - if texts is not None: + if texts is not None and images is None: text_embeddings = self.get_text_embeddings(texts, batch_size) + return text_embeddings - if images is not None: + if images is not None and texts is None: image_embeddings = self.get_image_embeddings(images, batch_size) - - if text_embeddings is not None and image_embeddings is not None: - if len(text_embeddings) != len(image_embeddings): - raise ValueError( - "The number of texts and images must have the same length" - ) - texts = iter(texts) - all_fused_embeddings = [] - if isinstance(images, DataLoader): - import torchvision.transforms.functional as F - with torch.no_grad(): - for batch in images: - for b in batch: - text = next(texts) - inputs = self.processor( - f"<|image_1|> Represent the given image with the following question: {text}", - [F.to_pil_image(b.to("cpu"))], - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - outputs = self.encode_input(inputs) - all_fused_embeddings.append(outputs.cpu()) - fused_embeddings = torch.cat(all_fused_embeddings, dim=0) - return fused_embeddings - elif text_embeddings is not None: - return text_embeddings - elif image_embeddings is not None: return image_embeddings + # text_embeddings is not None and image_embeddings is not None + texts = iter(texts) + all_fused_embeddings = [] + if isinstance(images, DataLoader): + import torchvision.transforms.functional as F + + with torch.no_grad(): + for batch in images: + for b in batch: + text = next(texts) + inputs = self.processor( + f"<|image_1|> Represent the given image with the following question: {text}", + [F.to_pil_image(b.to("cpu"))], + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + outputs = self.encode_input(inputs) + all_fused_embeddings.append(outputs.cpu().to(torch.float32)) + fused_embeddings = torch.cat(all_fused_embeddings, dim=0) + return fused_embeddings + vlm2vec_lora = ModelMeta( loader=partial( diff --git a/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/BLINKIT2IRetrieval.json b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/BLINKIT2IRetrieval.json new file mode 100644 index 0000000000..9077b8c127 --- /dev/null +++ b/results-mieb/TIGER-Lab__VLM2Vec-LoRA/7403b6327958071c1e33c822c7453adadccc7298/BLINKIT2IRetrieval.json @@ -0,0 +1,186 @@ +{ + "dataset_revision": "359b66f11c25d19bc8f7108d98e660a5857f3d26", + "evaluation_time": 224.3045289516449, + "kg_co2_emissions": null, + "mteb_version": "1.14.21", + "scores": { + "test": [ + { + "cv_recall_at_1": 0.33582, + "cv_recall_at_10": 0.63184, + "cv_recall_at_100": 0.90547, + "cv_recall_at_1000": 1.0, + "cv_recall_at_20": 0.72637, + "cv_recall_at_3": 0.52736, + "cv_recall_at_5": 0.57711, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.48677, + "map_at_1": 0.33582, + "map_at_10": 0.43983, + "map_at_100": 0.45072, + "map_at_1000": 0.45136, + "map_at_20": 0.44629, + "map_at_3": 0.4204, + "map_at_5": 0.43209, + "mrr_at_1": 0.3358208955223881, + "mrr_at_10": 0.43982666034904844, + "mrr_at_100": 0.45071603631711843, + "mrr_at_1000": 0.4513620025552411, + "mrr_at_20": 0.44629389891710264, + "mrr_at_3": 0.42039800995024873, + "mrr_at_5": 0.43208955223880596, + "nauc_cv_recall_at_1000_diff1": NaN, + "nauc_cv_recall_at_1000_max": NaN, + "nauc_cv_recall_at_1000_std": NaN, + "nauc_cv_recall_at_100_diff1": 0.2698006811472229, + "nauc_cv_recall_at_100_max": -0.8185369176412147, + "nauc_cv_recall_at_100_std": -1.0075573331300645, + "nauc_cv_recall_at_10_diff1": 0.1788243841076317, + "nauc_cv_recall_at_10_max": -0.612746605676537, + "nauc_cv_recall_at_10_std": -0.9530034638964193, + "nauc_cv_recall_at_1_diff1": 0.2822466928213175, + "nauc_cv_recall_at_1_max": -0.33285460282946566, + "nauc_cv_recall_at_1_std": -0.5640630200327668, + "nauc_cv_recall_at_20_diff1": 0.12960665874064498, + "nauc_cv_recall_at_20_max": -0.8021930752909697, + "nauc_cv_recall_at_20_std": -1.071727841531356, + "nauc_cv_recall_at_3_diff1": 0.2019990705156492, + "nauc_cv_recall_at_3_max": -0.4459723288652778, + "nauc_cv_recall_at_3_std": -0.7870326458461854, + "nauc_cv_recall_at_5_diff1": 0.23846869773197804, + "nauc_cv_recall_at_5_max": -0.47064146280491637, + "nauc_cv_recall_at_5_std": -0.8505490027952863, + "nauc_map_at_1000_diff1": 0.2409455106402186, + "nauc_map_at_1000_max": -0.407384704974283, + "nauc_map_at_1000_std": -0.6813392812409054, + "nauc_map_at_100_diff1": 0.24104039617379056, + "nauc_map_at_100_max": -0.408250752001871, + "nauc_map_at_100_std": -0.682039826842636, + "nauc_map_at_10_diff1": 0.2411578991609476, + "nauc_map_at_10_max": -0.4044975322245294, + "nauc_map_at_10_std": -0.6840284218652672, + "nauc_map_at_1_diff1": 0.2822466928213175, + "nauc_map_at_1_max": -0.33285460282946566, + "nauc_map_at_1_std": -0.5640630200327668, + "nauc_map_at_20_diff1": 0.2388129109858802, + "nauc_map_at_20_max": -0.4111868495020421, + "nauc_map_at_20_std": -0.6861574124125431, + "nauc_map_at_3_diff1": 0.2415896795362189, + "nauc_map_at_3_max": -0.3816620658278678, + "nauc_map_at_3_std": -0.6597288106948174, + "nauc_map_at_5_diff1": 0.2490237529189825, + "nauc_map_at_5_max": -0.3874920448893905, + "nauc_map_at_5_std": -0.6729363831482362, + "nauc_mrr_at_1000_diff1": 0.2409455106402186, + "nauc_mrr_at_1000_max": -0.407384704974283, + "nauc_mrr_at_1000_std": -0.6813392812409054, + "nauc_mrr_at_100_diff1": 0.24104039617379056, + "nauc_mrr_at_100_max": -0.408250752001871, + "nauc_mrr_at_100_std": -0.682039826842636, + "nauc_mrr_at_10_diff1": 0.2411578991609476, + "nauc_mrr_at_10_max": -0.4044975322245294, + "nauc_mrr_at_10_std": -0.6840284218652672, + "nauc_mrr_at_1_diff1": 0.2822466928213175, + "nauc_mrr_at_1_max": -0.33285460282946566, + "nauc_mrr_at_1_std": -0.5640630200327668, + "nauc_mrr_at_20_diff1": 0.2388129109858802, + "nauc_mrr_at_20_max": -0.4111868495020421, + "nauc_mrr_at_20_std": -0.6861574124125431, + "nauc_mrr_at_3_diff1": 0.2415896795362189, + "nauc_mrr_at_3_max": -0.3816620658278678, + "nauc_mrr_at_3_std": -0.6597288106948174, + "nauc_mrr_at_5_diff1": 0.2490237529189825, + "nauc_mrr_at_5_max": -0.3874920448893905, + "nauc_mrr_at_5_std": -0.6729363831482362, + "nauc_ndcg_at_1000_diff1": 0.2344092745371185, + "nauc_ndcg_at_1000_max": -0.4328446999637412, + "nauc_ndcg_at_1000_std": -0.709474885782739, + "nauc_ndcg_at_100_diff1": 0.23614140799135372, + "nauc_ndcg_at_100_max": -0.45059659752764697, + "nauc_ndcg_at_100_std": -0.7232249956899242, + "nauc_ndcg_at_10_diff1": 0.22732780326872332, + "nauc_ndcg_at_10_max": -0.44870094654610376, + "nauc_ndcg_at_10_std": -0.7429856899210265, + "nauc_ndcg_at_1_diff1": 0.2822466928213175, + "nauc_ndcg_at_1_max": -0.33285460282946566, + "nauc_ndcg_at_1_std": -0.5640630200327668, + "nauc_ndcg_at_20_diff1": 0.21923921046650915, + "nauc_ndcg_at_20_max": -0.4772077987509219, + "nauc_ndcg_at_20_std": -0.7553659757754395, + "nauc_ndcg_at_3_diff1": 0.23134535418875965, + "nauc_ndcg_at_3_max": -0.39788809593745006, + "nauc_ndcg_at_3_std": -0.691825805701685, + "nauc_ndcg_at_5_diff1": 0.24552033909645363, + "nauc_ndcg_at_5_max": -0.4076854503791393, + "nauc_ndcg_at_5_std": -0.7156597141210952, + "nauc_precision_at_1000_diff1": NaN, + "nauc_precision_at_1000_max": NaN, + "nauc_precision_at_1000_std": NaN, + "nauc_precision_at_100_diff1": 0.2698006811472233, + "nauc_precision_at_100_max": -0.818536917641218, + "nauc_precision_at_100_std": -1.007557333130069, + "nauc_precision_at_10_diff1": 0.17882438410763166, + "nauc_precision_at_10_max": -0.6127466056765365, + "nauc_precision_at_10_std": -0.9530034638964187, + "nauc_precision_at_1_diff1": 0.2822466928213175, + "nauc_precision_at_1_max": -0.33285460282946566, + "nauc_precision_at_1_std": -0.5640630200327668, + "nauc_precision_at_20_diff1": 0.12960665874064572, + "nauc_precision_at_20_max": -0.8021930752909691, + "nauc_precision_at_20_std": -1.0717278415313547, + "nauc_precision_at_3_diff1": 0.20199907051564917, + "nauc_precision_at_3_max": -0.44597232886527743, + "nauc_precision_at_3_std": -0.7870326458461852, + "nauc_precision_at_5_diff1": 0.238468697731978, + "nauc_precision_at_5_max": -0.47064146280491626, + "nauc_precision_at_5_std": -0.8505490027952862, + "nauc_recall_at_1000_diff1": NaN, + "nauc_recall_at_1000_max": NaN, + "nauc_recall_at_1000_std": NaN, + "nauc_recall_at_100_diff1": 0.2698006811472229, + "nauc_recall_at_100_max": -0.8185369176412147, + "nauc_recall_at_100_std": -1.0075573331300645, + "nauc_recall_at_10_diff1": 0.1788243841076317, + "nauc_recall_at_10_max": -0.612746605676537, + "nauc_recall_at_10_std": -0.9530034638964193, + "nauc_recall_at_1_diff1": 0.2822466928213175, + "nauc_recall_at_1_max": -0.33285460282946566, + "nauc_recall_at_1_std": -0.5640630200327668, + "nauc_recall_at_20_diff1": 0.12960665874064498, + "nauc_recall_at_20_max": -0.8021930752909697, + "nauc_recall_at_20_std": -1.071727841531356, + "nauc_recall_at_3_diff1": 0.2019990705156492, + "nauc_recall_at_3_max": -0.4459723288652778, + "nauc_recall_at_3_std": -0.7870326458461854, + "nauc_recall_at_5_diff1": 0.23846869773197804, + "nauc_recall_at_5_max": -0.47064146280491637, + "nauc_recall_at_5_std": -0.8505490027952863, + "ndcg_at_1": 0.33582, + "ndcg_at_10": 0.48677, + "ndcg_at_100": 0.54341, + "ndcg_at_1000": 0.55646, + "ndcg_at_20": 0.51055, + "ndcg_at_3": 0.44788, + "ndcg_at_5": 0.46865, + "precision_at_1": 0.33582, + "precision_at_10": 0.06318, + "precision_at_100": 0.00905, + "precision_at_1000": 0.001, + "precision_at_20": 0.03632, + "precision_at_3": 0.17579, + "precision_at_5": 0.11542, + "recall_at_1": 0.33582, + "recall_at_10": 0.63184, + "recall_at_100": 0.90547, + "recall_at_1000": 1.0, + "recall_at_20": 0.72637, + "recall_at_3": 0.52736, + "recall_at_5": 0.57711 + } + ] + }, + "task_name": "BLINKIT2IRetrieval" +} \ No newline at end of file