from __future__ import annotations

import argparse
import json
import logging
import os
from collections import OrderedDict, defaultdict
from pathlib import Path
from typing import Any

import numpy as np
import pytorch_lightning as pl
import torch
import torch.distributed as dist
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense.util import cos_sim, dot_score
from datasets import DatasetDict, Value, load_dataset
from pytorch_lightning import LightningModule
from torch.utils.data import DataLoader, Dataset

from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata
from mteb.encoder_interface import Encoder
from mteb.load_results.task_results import ScoresDict

from .AbsTask import AbsTask

CORPUS_EMBD_FILENAME = "corpus_embds.jsonl"
QUERIES_EMBD_FILENAME = "queries_embds.jsonl"
RETRIEVE_EVAL_FILENAME = "retrieve_eval.json"
RETRIEVE_PRED_FILENAME = "retrieve_pred.json"

logger = logging.getLogger(__name__)


# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca9910b1e0d218759d4/beir/datasets/data_loader_hf.py#L10
class HFDataLoader:
    """Loads the corpus, queries and qrels of one RTEB dataset from a single HF repo.

    The repo is expected to expose three configs: ``corpus`` and ``queries``
    (each with ``id``/``text`` columns) and ``default`` (the qrels with
    ``query-id``/``corpus-id``/``score`` columns).
    """

    def __init__(
        self,
        hf_repo: str | None = None,
        streaming: bool = False,
        keep_in_memory: bool = False,
        trust_remote_code: bool = False,
        token: str | None = None,
    ):
        self._loaded = False
        self.corpus = {}
        self.queries = {}
        self.qrels = {}
        self.hf_repo = hf_repo
        self.hf_repo_qrels = hf_repo  # Always use same repo

        self.streaming = streaming
        self.keep_in_memory = keep_in_memory
        self.trust_remote_code = trust_remote_code

        # Fall back to the HF_TOKEN env var. Use .get() so public repos work
        # with no token at all instead of raising KeyError on a missing var.
        self.token = token or os.environ.get("HF_TOKEN")

    def load(
        self, split="test"
    ) -> tuple[dict[str, dict[str, str]], dict[str, str], dict[str, dict[str, int]]]:
        """Load corpus/queries/qrels (once) and return them.

        Returns:
            (corpus, queries, qrels_dict) where qrels_dict maps
            query-id -> {corpus-id: relevance score}. Queries without any
            qrels entry are filtered out.
        """
        if not self._loaded:
            logger.info("Loading Corpus...")
            self._load_corpus()
            logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper())

            logger.info("Loading Queries...")
            self._load_queries()

            self._load_qrels(split)
            self._loaded = True

        # Build {query-id: {corpus-id: score}} from the qrels rows.
        qrels_dict = defaultdict(dict)

        def qrels_dict_init(row):
            qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"])

        # qrels may be a Dataset (map over rows) or already a plain dict.
        if hasattr(self.qrels, "map"):
            self.qrels.map(qrels_dict_init)
        else:
            qrels_dict = self.qrels

        # Drop queries that have no relevance judgments.
        if hasattr(self.queries, "filter"):
            self.queries = self.queries.filter(lambda x: x["id"] in qrels_dict)

        return self.corpus, self.queries, qrels_dict

    def _load_dataset(self, dataset_type: str):
        """Load one config (``corpus`` or ``queries``) and keep only id/text."""
        ds = load_dataset(
            self.hf_repo,
            dataset_type,
            keep_in_memory=self.keep_in_memory,
            streaming=self.streaming,
            trust_remote_code=self.trust_remote_code,
        )
        ds = next(iter(ds.values()))  # get first split
        return ds.cast_column("id", Value("string")).remove_columns(
            [col for col in ds.column_names if col not in ["id", "text"]]
        )

    def _load_corpus(self):
        self.corpus = self._load_dataset("corpus")

    def _load_queries(self):
        self.queries = self._load_dataset("queries")

    def _load_qrels(self, split):
        """Load the qrels config, restricted to the requested split.

        The ``default`` config loads as a DatasetDict; previously the whole
        dict was kept and ``split`` was silently ignored, which merged qrels
        from every available split in load(). Select the requested split when
        it exists; otherwise keep the prior behavior as a fallback.
        """
        qrels_ds = load_dataset(
            self.hf_repo,
            "default",
            keep_in_memory=self.keep_in_memory,
            streaming=self.streaming,
            trust_remote_code=self.trust_remote_code,
        )
        if isinstance(qrels_ds, DatasetDict) and split in qrels_ds:
            qrels_ds = qrels_ds[split]
        self.qrels = qrels_ds
def gather_list(data: list, num_devices: int):
    """Gather ``data`` from every device and merge the shards into one list.

    No-op for a single device; otherwise uses all_gather_object and flattens
    the gathered shards in linear time (``sum(shards, [])`` is quadratic in
    the number of shards).
    """
    if num_devices == 1:
        return data
    gathered = [None] * num_devices
    dist.all_gather_object(gathered, data)
    return [item for shard in gathered for item in shard]


def run_retrieve_evaluation(relevance, prediction):
    """Score ``prediction`` against ``relevance`` with BEIR's retrieval metrics.

    Returns a flat dict of ndcg_at_k / map_at_k / recall_at_k / precision_at_k
    for k in {1, 3, 5, 10, 20, 50, 100}.
    """
    if len(relevance) != len(prediction):
        raise RuntimeError("Prediction and ground truth have different sizes.")

    ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(
        relevance,
        prediction,
        k_values=[1, 3, 5, 10, 20, 50, 100],
        ignore_identical_ids=False,
    )
    # BEIR keys look like "NDCG@10"; normalize to "<metric>_at_<k>".
    scores = {
        **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
        **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
        **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
        **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
    }
    return scores


class Retriever(LightningModule):
    """Brute-force top-k retriever driven through ``trainer.predict``.

    Expects ``self.corpus_embd_dataloader`` to be attached by the caller
    before prediction; each predicted batch of query embeddings is scored
    against every corpus batch.
    """

    def __init__(
        self,
        topk: int = 100,
        similarity: str = "cosine",
        save_prediction: bool = False,
    ):
        super().__init__()
        self.topk = topk
        if similarity == "cosine":
            self.similarity_fn = cos_sim
            self.largest = True
        elif similarity == "dot":
            self.similarity_fn = dot_score
            self.largest = True
        elif similarity == "euclidean":
            # cdist returns distances: smaller is better.
            self.similarity_fn = torch.cdist
            self.largest = False
        else:
            raise ValueError(f"similarity {similarity} is invalid.")
        self.in_memory = True
        self.save_file = None
        self.save_prediction = save_prediction

    @property
    def local_prediction_file_name(self):
        # Per-rank shard file: <save_file>-<rank>-of-<num_devices>.
        assert self.save_file is not None
        num_shards = self.trainer.num_devices
        return f"{self.save_file}-{self.local_rank}-of-{num_shards}"

    def get_local_prediction_files(self, num_shards=None):
        """Return every rank's shard file name."""
        assert self.save_file is not None
        if num_shards is None:
            num_shards = self.trainer.num_devices
        return [f"{self.save_file}-{i}-of-{num_shards}" for i in range(num_shards)]

    def on_predict_epoch_start(self):
        self.local_prediction = {}

    def predict_step(self, batch, batch_idx):
        """Score one batch of queries against the whole corpus and keep top-k."""
        query_ids, query_embds = batch["id"], batch["embd"].float()
        if isinstance(query_ids, torch.Tensor):
            # TODO: change dataloader to support int id
            raise NotImplementedError("id must be a string.")
        corpus_ids = []
        batch_scores = []
        # Compute the similarity in batches
        for corpus_batch in self.corpus_embd_dataloader:
            corpus_ids += corpus_batch["id"]
            corpus_embds = corpus_batch["embd"].float().to(query_embds.device)
            scores = self.similarity_fn(query_embds, corpus_embds).cpu()
            batch_scores.append(scores)
        # Concat the scores and compute top-k
        scores = torch.cat(batch_scores, dim=1)
        if not self.largest:
            # Distances: negate so topk(largest=True) selects the smallest.
            scores = scores * -1
        topk = min(self.topk, len(corpus_ids))
        topk_scores, topk_ids = torch.topk(scores, topk, dim=1, largest=True)
        topk_scores, topk_ids = topk_scores.tolist(), topk_ids.tolist()
        for i, qid in enumerate(query_ids):
            result = OrderedDict()
            for j in range(topk):
                cid = corpus_ids[topk_ids[i][j]]
                result[cid] = topk_scores[i][j]
            self.local_prediction[qid] = result

    def on_predict_epoch_end(self):
        """Merge per-rank predictions (in memory or via shard files)."""
        if self.trainer.num_devices > 1:
            if self.in_memory:
                gathered_prediction = [None] * self.trainer.num_devices
                dist.all_gather_object(gathered_prediction, self.local_prediction)
                self.prediction = {
                    k: v for preds in gathered_prediction for k, v in preds.items()
                }
            else:
                # Spill to per-rank files, then rank 0 merges them.
                with open(self.local_prediction_file_name, "w") as f:
                    json.dump(self.local_prediction, f)
                self.trainer.strategy.barrier()
                self.prediction = {}
                if self.trainer.is_global_zero:
                    for file in self.get_local_prediction_files():
                        with open(file) as f:
                            self.prediction.update(json.load(f))
        else:
            self.prediction = self.local_prediction

        if self.save_prediction and self.trainer.is_global_zero:
            assert self.save_file is not None
            with open(self.save_file, "w") as f:
                json.dump(self.prediction, f)


class EmbeddingDataCollator:
    def __call__(self, examples):
        assert len(examples) > 0
        batch
= { + key: [example[key] for example in examples] for key in examples[0].keys() + } + batch["embd"] = torch.tensor(batch["embd"]) + return batch + + +class EmptyDataset(Dataset): + def __init__(self, data, transform=None): + self.transform = transform + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + item = self.data[idx] + + # Optionally apply any transformations + if self.transform: + item = self.transform(item) + + return item + + +class JSONLDataset(Dataset): + def __init__(self, file_path, transform=None): + self.file_path = file_path + self.transform = transform + self.data = [] + + # Always convert to list for uniform processing + file_paths = [file_path] if isinstance(file_path, str) else file_path + + for path in file_paths: + with open(path) as f: + for line in f: + self.data.append(json.loads(line)) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + item = self.data[idx] + + # Optionally apply any transformations + if self.transform: + item = self.transform(item) + + return item + + +class MTEBToRTEBEncoderWrapper(LightningModule): + """Acts as a PyTorch Lightning Module to wrap an MTEB Encoder, + replicating the necessary functionality of RTEB's Encoder class + for use with trainer.predict, but overriding __setattr__ to prevent recursion. 
+ """ + + @property + def load_embds(self) -> bool: + return self._load_embds + + @property + def save_embds(self) -> bool: + # If in_memory=False, we have to save the embeddings + return self._save_embds or not self.in_memory + + @property + def local_embd_file_name(self) -> str: + assert self.save_file is not None + num_shards = self.trainer.num_devices + return f"{self.save_file}-{self.local_rank}-of-{num_shards}" + + def get_local_embd_files(self, num_shards=None) -> list[str]: + # Return local (intermediate) file names, which are jsonl files + assert self.save_file is not None + if num_shards is None: + num_shards = self.trainer.num_devices + return [f"{self.save_file}-{i}-of-{num_shards}" for i in range(num_shards)] + + def get_embd_files(self, num_shards=None) -> list[str]: + # Return the final file names, which are arrow files + local_files = self.get_local_embd_files(num_shards=num_shards) + return local_files + + def embd_files_exist(self, num_shards=None) -> bool: + files = self.get_embd_files(num_shards=num_shards) + return all(os.path.exists(file) for file in files) + + def on_predict_epoch_start(self): + self.embds = None + + if self.in_memory: + self.local_embds = [] + + if self.load_embds: + self.local_existing_ids = set() + if os.path.exists(self.local_embd_file_name): + logger.warning(f"Load embeddings from {self.local_embd_file_name}") + ds = JSONLDataset(self.local_embd_file_name) + for example in ds: + self.local_existing_ids.add(example["id"]) + if self.in_memory: + self.local_embds.append(example) + else: + logger.warning( + f"load_embds is True but {self.local_embd_file_name} doesn't exist. Skipping the loading." 
+ ) + + if self.save_embds: + if self.load_embds: + # append to the file + self.local_embd_file = open(self.local_embd_file_name, "a") + else: + # rewrite the file + self.local_embd_file = open(self.local_embd_file_name, "w") + + def on_predict_epoch_end(self): + if self.save_embds: + self.local_embd_file.close() + if self.in_memory: + self.embds = gather_list(self.local_embds, self.trainer.num_devices) + self.trainer.strategy.barrier() + + def __init__( + self, + mteb_model: Encoder, + task_name: str, + model_name: str = "mteb_wrapped_model", + save_embds: bool = False, + load_embds: bool = False, + batch_size: int = 16, + **kwargs, + ): + super().__init__(**kwargs) + self._load_embds = load_embds + self._save_embds = save_embds + # Keep the embeddings in memory by default. Set it to False for large corpus. + self.in_memory = True + self.is_query = False + self.save_file = None + + self.mteb_model_instance = mteb_model + self.model_name = model_name + self.task_name = task_name + self.batch_size = batch_size + self.query_instruct = "" # Add instructions if applicable + self.corpus_instruct = "" # Add instructions if applicable + self.embd_dim = None + self.embd_dtype = "float32" + + # Internal state + self.embds = None + self.local_embds = [] + self.local_existing_ids = set() + self.local_embd_file = None + + # --- Properties expected by run_retrieve_task --- + @property + def model(self): + return self + + # --- End Properties --- + + def encode(self, sentences: list[str], **kwargs) -> torch.Tensor: + """Encodes sentences using the wrapped MTEB model and returns torch.Tensor.""" + embeddings = self.mteb_model_instance.encode( + sentences, batch_size=self.batch_size, **kwargs + ) + if self.embd_dim is None and hasattr(embeddings, "shape"): + if len(embeddings.shape) >= 2: + self.embd_dim = embeddings.shape[1] + elif len(embeddings.shape) == 1 and embeddings.shape[0] == 0: + pass + else: + logger.warning( + f"Unexpected embedding shape: {embeddings.shape}. 
Cannot determine embd_dim." + ) + + if isinstance(embeddings, np.ndarray): + return torch.from_numpy(embeddings).to(torch.float32) + elif isinstance(embeddings, torch.Tensor): + return embeddings.to(torch.float32) + elif isinstance(embeddings, list): + if not embeddings: + dim = self.embd_dim if self.embd_dim is not None else 768 + return torch.empty((0, dim), dtype=torch.float32) + if isinstance(embeddings[0], np.ndarray): + return torch.from_numpy(np.stack(embeddings)).to(torch.float32) + elif isinstance(embeddings[0], torch.Tensor): + return torch.stack(embeddings).to(torch.float32) + else: + raise TypeError( + f"Unsupported embedding list element type: {type(embeddings[0])}" + ) + else: + raise TypeError( + f"Unsupported embedding type from MTEB model: {type(embeddings)}" + ) + + # --- Replicated predict hooks from RtebEncoder --- + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None: + if not isinstance(batch, dict) or "id" not in batch or "text" not in batch: + logger.error( + f"Unsupported batch type or missing keys in predict_step: {type(batch)}" + ) + return + + indices = batch["id"] + sentences = batch["text"] + + if not indices or not sentences: + return + + if self.load_embds and self.local_existing_ids: + if all(idx in self.local_existing_ids for idx in indices): + return + if any(idx in self.local_existing_ids for idx in indices): + logger.warning( + "Partial loading within batch detected, but not supported. Re-encoding entire batch." 
+ ) + + try: + embds = self.encode(sentences, task_name=self.task_name) + except Exception as e: + logger.error( + f"Encoding failed for batch_idx {batch_idx}: {e}", exc_info=True + ) + return + + for idx, embd in zip(indices, embds): + embd_list = embd.tolist() + obj = {"id": idx, "embd": embd_list} + + if self.in_memory: + if not (self.load_embds and idx in self.local_existing_ids): + self.local_embds.append(obj) + + if self.save_embds and self.local_embd_file: + if not (self.load_embds and idx in self.local_existing_ids): + try: + self.local_embd_file.write(json.dumps(obj) + "\n") + except Exception as e: + logger.error( + f"Failed to write embedding for ID {idx} to file: {e}" + ) + + def apply(self, fn): + # Override apply to prevent recursion into the wrapped mteb_model_instance + super().apply(fn) + return self + + # --- End Replicated Hooks --- + + +class AbsTaskRTEB(AbsTask): + """Abstract class for retrieval experiments.""" + + ignore_identical_ids: bool = False + abstask_prompt = "Retrieve text based on user query." 
+ corpus: Dataset | None = None + queries: Dataset | None = None + relevant_docs: dict[str, dict[str, dict[str, int]]] | None = None + + def __init__(self, **kwargs): # Require hf_repo + self.rteb_dataset_name = kwargs.pop("rteb_dataset_name", None) + # Derive dataset name from task name if not provided + if self.rteb_dataset_name is None: + # Remove "RTEB" prefix from task name to get dataset name + self.rteb_dataset_name = self.metadata.name.replace("RTEB", "") + + self.hf_repo = f"embedding-benchmark/{self.rteb_dataset_name}" + self._hf_data_loader = HFDataLoader(hf_repo=self.hf_repo) + + super().__init__(**kwargs) + + def _validate_task_config(self): + """Validate task-specific configuration.""" + if not self.hf_repo: + raise ValueError( + f"HuggingFace repo is required for {self.__class__.__name__}" + ) + if not self.rteb_dataset_name: + raise ValueError( + f"RTEB dataset name is required for {self.__class__.__name__}" + ) + + @staticmethod + def create_rteb_task_metadata( + task_name: str, + dataset_name: str | None = None, + description: str | None = None, + reference: str | None = None, + dataset_path: str | None = None, + dataset_revision: str | None = None, + eval_langs: list[str] | None = None, + main_score: str = "ndcg_at_10", + domains: list[str] | None = None, + revision: str = "1.0.0", + date: tuple[str, str] | None = None, + license: str | None = None, + annotations_creators: str | None = None, + text_creation: str | None = None, + task_subtypes: list[str] | None = None, + dialect: list[str] | None = None, + bibtex_citation: str | None = None, + modalities: list[str] | None = None, + hf_subsets_to_langscripts: dict[str, list[str]] | None = None, + **kwargs: Any, + ) -> TaskMetadata: + """Factory function to create TaskMetadata for RTEB tasks with sensible defaults. + + This function simplifies the creation of TaskMetadata objects for RTEB tasks + by providing sensible defaults and deriving values where possible. 
+ + Args: + task_name: Name of the task (e.g., "RTEBLegalQuAD") + dataset_name: Name of the dataset. If None, derived from task_name by removing "RTEB" prefix + description: Task description. If None, generated from dataset_name + reference: Reference URL for the dataset + dataset_path: HuggingFace dataset path. If None, defaults to "mteb/{dataset_name}" + dataset_revision: HuggingFace dataset revision + eval_langs: List of evaluation languages. Defaults to ["eng-Latn"] + main_score: Main evaluation metric. Defaults to "ndcg_at_10" + domains: List of domains the dataset belongs to + revision: Task revision string + date: Tuple of (start_date, end_date) for the dataset + license: Dataset license + annotations_creators: How annotations were created + text_creation: How text was created + task_subtypes: List of task subtypes + dialect: List of dialects + bibtex_citation: BibTeX citation for the dataset + modalities: List of modalities + hf_subsets_to_langscripts: Mapping of HF subsets to language scripts + **kwargs: Additional arguments to pass to TaskMetadata + + Returns: + TaskMetadata object configured for the RTEB task + """ + # Derive dataset name from task name if not provided + if dataset_name is None: + dataset_name = task_name.replace("RTEB", "") + + # Generate description if not provided + if description is None: + description = f"RTEB evaluation for {dataset_name} dataset." 
+ + # Set default dataset path if not provided + if dataset_path is None: + dataset_path = f"mteb/{dataset_name}" + + # Set default date if not provided + if date is None: + date = ("2021-01-01", "2021-01-01") + + # Set default eval_langs if not provided + if eval_langs is None: + eval_langs = ["eng-Latn"] + + # Set default domains if not provided + if domains is None: + domains = [] + + # Set default task_subtypes if not provided + if task_subtypes is None: + task_subtypes = [] + + # Set default dialect if not provided + if dialect is None: + dialect = [] + + # Set default modalities if not provided + if modalities is None: + modalities = ["text"] + + # Set default hf_subsets_to_langscripts if not provided + if hf_subsets_to_langscripts is None: + hf_subsets_to_langscripts = {} + + # Create dataset dictionary + dataset_dict = {"path": dataset_path} + if dataset_revision: + dataset_dict["revision"] = dataset_revision + + # Create and return TaskMetadata + return TaskMetadata( + name=task_name, + description=description, + reference=reference, + dataset=dataset_dict, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=eval_langs, + main_score=main_score, + revision=revision, + date=date, + domains=domains, + license=license, + annotations_creators=annotations_creators, + text_creation=text_creation, + task_subtypes=task_subtypes, + dialect=dialect, + bibtex_citation=bibtex_citation, + modalities=modalities, + hf_subsets_to_langscripts=hf_subsets_to_langscripts, + **kwargs, + ) + + def load_data(self, **kwargs): + """Load data from HuggingFace.""" + if self.data_loaded: + return + + # Validate task configuration + self._validate_task_config() + + logger.info( + f"Loading data for {self.metadata.name} ({self.rteb_dataset_name}) from HuggingFace repo: {self.hf_repo}." 
+ ) + + self.corpus, self.queries, self.relevant_docs = self._hf_data_loader.load() + + self.data_loaded = True + + def run_rteb_evaluation( + self, + model: Encoder, + hf_subset: HFSubset, + batch_size: int = 32, + **kwargs: Any, + ) -> ScoresDict: + """Runs the RTEB evaluation pipeline with pl.Trainer.""" + logger.info( + f"Starting RTEB evaluation via PL Runner: {self.metadata.name} ({self.rteb_dataset_name})..." + ) + + if hasattr(model, "mteb_model_meta"): + model_name = model.mteb_model_meta.name + else: + model_name = getattr(model, "model_name", "mteb_wrapped_model") + + # Configure Trainer + trainer_kwargs = { + "accelerator": kwargs.get("accelerator", "auto"), + "devices": kwargs.get("devices", "auto"), + "num_nodes": kwargs.get("num_nodes", 1), + "strategy": kwargs.get("strategy", "auto"), + "precision": kwargs.get("precision", "32-true"), + "logger": False, # Disable default logger + "enable_checkpointing": False, + "enable_progress_bar": True, + } + trainer = pl.Trainer(**trainer_kwargs) + + save_embds_flag = kwargs.get("save_embeddings", False) + load_embds_flag = kwargs.get("load_embeddings", False) + + rteb_encoder = MTEBToRTEBEncoderWrapper( + model, + task_name=self.metadata.name, + model_name=model_name, + save_embds=save_embds_flag, + load_embds=load_embds_flag, + batch_size=batch_size, + ) + rteb_encoder._trainer = trainer + + args = argparse.Namespace( + save_path=kwargs.get( + "output_folder", f"results/rteb_output/{self.rteb_dataset_name}" + ), + batch_size=kwargs.get("batch_size", batch_size), + embd_batch_size=kwargs.get("embd_batch_size", 128), + num_workers=kwargs.get("num_workers", 0), + embd_in_memory_threshold=kwargs.get("embd_in_memory_threshold", 100000), + overwrite=kwargs.get("overwrite_results", False), + load_embds=load_embds_flag, # Use the flag from kwargs + save_embds=save_embds_flag, # Use the flag from kwargs + ) + task_save_path = Path(args.save_path) / model_name + task_save_path.mkdir(parents=True, exist_ok=True) + 
rteb_cache_path = Path( + f"{os.path.expanduser('~')}/.cache/rteb/{self.rteb_dataset_name}/{model_name}" + ) + rteb_cache_path.mkdir(parents=True, exist_ok=True) + + # Check if results already exist + eval_file = rteb_cache_path / RETRIEVE_EVAL_FILENAME # Use consistent filename + if not args.overwrite and eval_file.exists(): + if trainer.is_global_zero: + logger.info( + f"Results already exist for {self.metadata.name} at {eval_file}. Skipping." + ) + with open(str(eval_file)) as f: + scores = json.load(f) + return scores + else: + # Non-global zero ranks should wait for global zero to finish + trainer.strategy.barrier() + with open(str(eval_file)) as f: + scores = json.load(f) + return scores + + # 1. Load Data using AbsTaskRTEB (already done by the task instance) + try: + query_dataloader = DataLoader( + self.queries, + batch_size=args.batch_size, + num_workers=args.num_workers, + collate_fn=None, + ) + + corpus_dataloader = DataLoader( + self.corpus, + batch_size=args.batch_size, + num_workers=args.num_workers, + collate_fn=None, + ) + + if trainer.is_global_zero: + logger.info(f"Queries size: {len(self.queries)}") + logger.info(f"Corpus size: {len(self.corpus)}") + + trainer.strategy.barrier() # Ensure data is prepared on all ranks + + if ( + len(self.queries) < trainer.num_devices + or len(self.corpus) < trainer.num_devices + ): + logger.warning("Skipping the task due to too few queries / documents.") + return {} + + if len(self.queries) >= 1e6: + logger.warning("Skipping the task due to too many queries.") + return {} + except Exception as e: + logger.error( + f"Failed to load data or create DataLoaders: {e}", + exc_info=True, + ) + return { + "main_score": 0.0, + self.metadata.main_score: 0.0, + "hf_subset": "default", + "languages": self.metadata.eval_langs, + } + + # 2. 
Encode Queries and Corpus using pl.Trainer + queries_embds_file = ( + task_save_path / QUERIES_EMBD_FILENAME + ) # Use consistent filename + corpus_embds_file = ( + task_save_path / CORPUS_EMBD_FILENAME + ) # Use consistent filename + + # Encode Queries + logger.info("Encoding queries") + rteb_encoder.is_query = True + rteb_encoder.in_memory = len(self.queries) < args.embd_in_memory_threshold + rteb_encoder.save_file = os.path.join(task_save_path, QUERIES_EMBD_FILENAME) + if args.load_embds and rteb_encoder.embd_files_exist(trainer.num_devices): + queries_embds_files = rteb_encoder.get_embd_files(trainer.num_devices) + logger.info(f"Embedding files exist: {queries_embds_files}") + queries_embd_ds = JSONLDataset( + queries_embds_files + ) # Create dataset directly + else: + logger.info(f"in_memory = {rteb_encoder.in_memory}") + logger.info(f"save_file = {rteb_encoder.save_file}") + trainer.predict( + model=rteb_encoder, dataloaders=query_dataloader + ) # Use the new dataloader + # Set the query embeddings + queries_embds_files = rteb_encoder.get_embd_files() + if rteb_encoder.in_memory: + queries_embd_ds = EmptyDataset( + rteb_encoder.embds + ) # Create dataset directly + else: + queries_embd_ds = JSONLDataset( + queries_embds_files + ) # Create dataset directly + trainer.strategy.barrier() # Ensure embeddings are ready on all ranks + + # Create queries_embd_dataloader + queries_embd_dataloader = DataLoader( + queries_embd_ds, + batch_size=args.embd_batch_size, + num_workers=args.num_workers, + collate_fn=EmbeddingDataCollator(), + ) + + # Encode Corpus + logger.info("Encoding corpus") + rteb_encoder.is_query = False + rteb_encoder.in_memory = len(self.corpus) < args.embd_in_memory_threshold + rteb_encoder.save_file = str(corpus_embds_file) + + if args.load_embds and corpus_embds_file.exists(): + if trainer.is_global_zero: + logger.info(f"Loading corpus embeddings from {corpus_embds_file}") + corpus_embd_ds = JSONLDataset( + [str(corpus_embds_file)] + ) # Create 
dataset directly + else: + if trainer.is_global_zero: + logger.info(f"in_memory = {rteb_encoder.in_memory}") + logger.info(f"save_file = {rteb_encoder.save_file}") + trainer.predict( + model=rteb_encoder, dataloaders=corpus_dataloader + ) # Use the new dataloader + if rteb_encoder.in_memory: + corpus_embd_ds = EmptyDataset( + rteb_encoder.embds + ) # Create dataset directly + else: + corpus_embd_ds = JSONLDataset( + [str(corpus_embds_file)] + ) # Create dataset directly + + trainer.strategy.barrier() # Ensure embeddings are ready on all ranks + + # Create corpus_embd_dataloader + corpus_embd_dataloader = DataLoader( + corpus_embd_ds, + batch_size=args.embd_batch_size, + num_workers=args.num_workers, + collate_fn=EmbeddingDataCollator(), + ) + + # 3. Manually Perform Retrieval + logger.info("Retrieve") + retriever_instance = Retriever(topk=100) # Instantiate Retriever + retriever_instance.corpus_embd_dataloader = ( + corpus_embd_dataloader # Use the new dataloader + ) + retriever_instance.in_memory = len(self.queries) < args.embd_in_memory_threshold + retriever_instance.save_file = str( + rteb_cache_path / RETRIEVE_PRED_FILENAME + ) # Use consistent filename + retriever_instance.save_prediction = True # Ensure prediction is saved + + trainer.predict( + model=retriever_instance, + dataloaders=queries_embd_dataloader, # Use the new dataloader + ) + + # Remove the embeddings if not saving + if not args.save_embds and not args.load_embds and trainer.is_global_zero: + if queries_embds_file.exists(): + os.remove(queries_embds_file) + if corpus_embds_file.exists(): + os.remove(corpus_embds_file) + + # 4. 
Run Evaluation + rteb_scores = {} + if trainer.is_global_zero: + try: + # Load predictions from the file saved by the retriever + prediction_file = rteb_cache_path / RETRIEVE_PRED_FILENAME + if not prediction_file.exists(): + logger.error(f"Prediction file not found at {prediction_file}") + raise FileNotFoundError( + f"Prediction file not found at {prediction_file}" + ) + + with open(str(prediction_file)) as f: + predictions = json.load(f) + + filtered_predictions = { + qid: scores + for qid, scores in predictions.items() + if qid in self.relevant_docs + } + if len(filtered_predictions) != len(self.relevant_docs): + logger.warning( + f"Number of queries in predictions ({len(filtered_predictions)}) does not match relevance data ({len(self.relevant_docs)}). Evaluating on intersection." + ) + filtered_relevance = { + qid: scores + for qid, scores in self.relevant_docs.items() + if qid in filtered_predictions + } + else: + filtered_relevance = self.relevant_docs + + if not filtered_predictions: + logger.error( + "No overlapping queries between predictions and relevance data." + ) + raise ValueError("No queries to evaluate.") + + rteb_scores = run_retrieve_evaluation( + filtered_relevance, filtered_predictions + ) + + logger.info("-" * 40) + logger.info(f"Dataset: {self.rteb_dataset_name}") + logger.info(f"Model: {model_name}") + logger.info(f"Save path: {task_save_path}") + logger.info("Retrieval evaluation:") + logger.info(rteb_scores) # Log the scores dictionary + + # 5. Format and Save Results + mteb_scores = dict(rteb_scores) + if self.metadata.main_score not in mteb_scores: + logger.warning( + f"Main score '{self.metadata.main_score}' not found in RTEB results." 
+ ) + fallback_score = ( + next(iter(mteb_scores.values()), 0.0) if mteb_scores else 0.0 + ) + mteb_scores["main_score"] = fallback_score + else: + mteb_scores["main_score"] = mteb_scores[self.metadata.main_score] + + mteb_scores["model_name"] = model_name + if rteb_encoder.embd_dim: + mteb_scores["embd_dim"] = rteb_encoder.embd_dim + mteb_scores["embd_dtype"] = rteb_encoder.embd_dtype + + keys_to_remove = ["model_name", "embd_dim", "embd_dtype"] + final_scores = {} + for key, value in mteb_scores.items(): + if key not in keys_to_remove: + try: + final_scores[key] = float(value) + except (ValueError, TypeError): + logger.warning( + f"Could not convert score '{key}' to float. Skipping." + ) + + if "main_score" not in final_scores and "main_score" in mteb_scores: + try: + final_scores["main_score"] = float(mteb_scores["main_score"]) + except (ValueError, TypeError): + final_scores["main_score"] = 0.0 + + final_scores["hf_subset"] = ( + hf_subset if self.is_multilingual else "default" + ) + final_scores["languages"] = self.metadata.eval_langs + + with open(str(eval_file), "w") as f: + json.dump(final_scores, f) + logger.info(f"Results saved to: {eval_file}") + rteb_scores = final_scores # Return the final formatted scores + + except Exception as e: + logger.error( + f"Error during score calculation or saving: {e}", exc_info=True + ) + rteb_scores = { + "main_score": 0.0, + self.metadata.main_score: 0.0, + "hf_subset": hf_subset if self.is_multilingual else "default", + "languages": self.metadata.eval_langs, + } + + trainer.strategy.barrier() # Ensure global zero finishes saving before other ranks proceeds + + # If not global zero, wait for global zero to save and then load the results + if not trainer.is_global_zero: + if eval_file.exists(): + with open(str(eval_file)) as f: + rteb_scores = json.load(f) + else: + logger.error( + f"Evaluation file not found on non-global zero rank: {eval_file}" + ) + rteb_scores = { + "main_score": 0.0, + self.metadata.main_score: 0.0, 
+ "hf_subset": hf_subset if self.is_multilingual else "default", + "languages": self.metadata.eval_langs, + } + + logger.info(f"Finished RTEB evaluation for {self.metadata.name}.") + return rteb_scores + + def evaluate( + self, + model, + split: str = "test", + subsets_to_run: list[HFSubset] | None = None, + *, + encode_kwargs: dict[str, Any] = {}, + **kwargs, + ) -> dict[HFSubset, ScoresDict]: + """Evaluate the model using the RTEB task runner.""" + if not self.data_loaded: + self.load_data() + + # RTEB tasks handle subsets internally based on dataset name + scores = {} + hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] + if subsets_to_run is not None: + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + + for hf_subset in hf_subsets: + logger.info( + f"Task: {self.metadata.name}, split: {split}, subset: {hf_subset}. Running..." + ) + + scores[hf_subset] = self.run_rteb_evaluation( + model=model, + hf_subset=hf_subset, + encode_kwargs=encode_kwargs, + batch_size=16, + **kwargs, + ) + + return scores + + def _evaluate_subset( + self, + model: Encoder, + data_split: DatasetDict | Dataset, + encode_kwargs: dict[str, Any], + **kwargs: Any, + ): + """Evaluate a subset of the dataset. + + Warning: + This method is deprecated and will be removed in future versions. + Use RTEBTaskRunner.run_rteb_evaluation for evaluation logic. + + Delegates to the parent class implementation while issuing a deprecation warning. + """ + import warnings + + warnings.warn( + "_evaluate_subset is deprecated for RTEB tasks. Use RTEBTaskRunner.run_rteb_evaluation instead.", + DeprecationWarning, + stacklevel=2, + ) + return super()._evaluate_subset(model, data_split, encode_kwargs, **kwargs) + + def _calculate_metrics_from_split( + self, split: str, hf_subset: str | None = None, compute_overall: bool = False + ): + """Calculate metrics for a given split. + + Note: + This method exists only for API compatibility. 
Actual metric calculation + happens in RTEBTaskRunner.run_rteb_evaluation. This implementation: + 1. Logs a warning when called + 2. Returns empty ScoresDict to satisfy interface requirements + + Parameters: + split: Dataset split to evaluate (e.g., 'test') + hf_subset: Optional Hugging Face dataset subset name + compute_overall: Whether to compute overall metrics across subsets + + Returns: + ScoresDict: Empty dictionary to maintain interface compatibility + """ + logger.warning( + f"_calculate_metrics_from_split called for split {split}, but metrics are calculated by RTEBTaskRunner." + ) + return ScoresDict() diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index c283457273..5c569111fe 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -126,6 +126,7 @@ "Summarization", "InstructionRetrieval", "Speed", + "RTEB", ) + MIEB_TASK_TYPE TASK_TYPE = Literal[TASK_TYPE] diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index 720f8747e8..6a6732181e 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -10,6 +10,7 @@ from .AbsTaskPairClassification import * from .AbsTaskReranking import * from .AbsTaskRetrieval import * +from .AbsTaskRTEB import * from .AbsTaskSpeedTask import * from .AbsTaskSTS import * from .AbsTaskSummarization import * diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index b1eb33442a..62eccb924e 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -368,6 +368,31 @@ def _batched_encode( public_training_data=None, ) +voyage_3_large = ModelMeta( + name="voyageai/voyage-3-large", # Use the identifier the user provided + revision="1", # Assuming revision 1 + release_date="2024-09-18", # Assuming same release as voyage-3 + languages=None, + loader=partial( # type: ignore + VoyageWrapper, + model_name="voyage-3-large", # Match the API model name + model_prompts=model_prompts, + ), + max_tokens=32000, # Assuming same as 
voyage-3 + embed_dim=1024, # Assuming same as voyage-3 + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://blog.voyageai.com/2024/09/18/voyage-3/", # Assuming same reference + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + voyage_3_lite = ModelMeta( name="voyageai/voyage-3-lite", revision="1", diff --git a/mteb/rteb/__init__.py b/mteb/rteb/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mteb/rteb/ebr/__init__.py b/mteb/rteb/ebr/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mteb/rteb/ebr/__main__.py b/mteb/rteb/ebr/__main__.py deleted file mode 100644 index 149339eb39..0000000000 --- a/mteb/rteb/ebr/__main__.py +++ /dev/null @@ -1,220 +0,0 @@ -from __future__ import annotations - -import argparse -import json -import logging -import os -from collections import defaultdict -from pathlib import Path - -import pytorch_lightning as pl -from ebr.core import Encoder, Retriever -from ebr.datasets import DATASET_REGISTRY, DatasetMeta -from ebr.models import MODEL_REGISTRY, ModelMeta -from ebr.retrieve import run_retrieve_task -from pytorch_lightning.strategies.ddp import DDPStrategy - -logger = logging.getLogger(__name__) -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def get_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - - # Evaluation - parser.add_argument( - "--gpus", type=int, default=0, help="Number of gpus used for encoding." - ) - parser.add_argument( - "--cpus", - type=int, - default=1, - help="Number of cpus used for computation (this is only for models that are not using gpus).", - ) - parser.add_argument("--bf16", action="store_true", help="`Use bf16 precision.") - parser.add_argument( - "--batch_size", type=int, default=16, help="Batch size for encoding." 
- ) - parser.add_argument( - "--embd_batch_size", - type=int, - default=1024, - help="Batch size for computing similarity of embeddings.", - ) - parser.add_argument( - "--embd_in_memory_threshold", - type=int, - default=200000, - help="Embeddings will be stored in memory if the amount is below this threshold.", - ) - - # Model - # parser.add_argument( - # "--model_name", type=str, default=None, help="Model name or path.") - # parser.add_argument( - # "--embd_dtype", type=str, default="float", help="Embedding type. Options: float32, int8, binary.") - # parser.add_argument( - # "--embd_dim", type=int, default=None, help="Embedding dimension.") - # parser.add_argument( - # "--max_length", type=int, default=None, help="Maximum length of model input.") - - # Data - parser.add_argument( - "--data_path", - type=str, - default="data/", - help="Path of the dataset, must be specified for custom tasks.", - ) - parser.add_argument( - "--task_name", - type=str, - default=None, - help="Name of the task. Can be multiple tasks splitted by `,`.", - ) - parser.add_argument( - "--data_type", - default="eval", - choices=["eval", "train", "chunk", "merge"], - help="Dataset type.", - ) - parser.add_argument( - "--num_workers", type=int, default=4, help="Number of workers for dataloader." - ) - - # Output - parser.add_argument( - "--save_path", type=str, default="output/", help="Path to save the output." - ) - parser.add_argument( - "--save_embds", action="store_true", help="Whether to save the embeddings." - ) - parser.add_argument( - "--load_embds", - action="store_true", - help="Whether to load the computed embeddings.", - ) - parser.add_argument( - "--save_prediction", - action="store_true", - help="Whether to save the predictions.", - ) - parser.add_argument( - "--topk", type=int, default=100, help="Number of top documents per query." - ) - parser.add_argument( - "--overwrite", action="store_true", help="Whether to overwrite the results." 
- ) - - args = parser.parse_args() - return args - - -def _dump_model_meta( - results_dir: str = "results", - model_registry: dict[str, ModelMeta] = MODEL_REGISTRY, -): - models = [meta.model_dump() for meta in model_registry.values()] - with open(Path(results_dir) / "models.json", "w") as f: - f.write(json.dumps(models, indent=4)) - - -def _dump_dataset_info( - results_dir: str = "results", - dataset_registry: dict[str, DatasetMeta] = DATASET_REGISTRY, -): - group_data = defaultdict(list) - for dataset_meta in dataset_registry.values(): - for group_name in dataset_meta.groups.keys(): - leaderboard = dataset_meta.loader.LEADERBOARD - group_data[(leaderboard, group_name)].append(dataset_meta.dataset_name) - - groups = [] - for (leaderboard, group_name), datasets in group_data.items(): - groups.append( - {"name": group_name, "datasets": datasets, "leaderboard": leaderboard} - ) - with open(Path(results_dir) / "datasets.json", "w") as f: - f.write(json.dumps(groups, indent=4)) - - -def _compile_results(results_dir: str = "results", output_dir: str = "output"): - results = [] - for dataset_output_dir in Path(output_dir).iterdir(): - dataset_results = [] - for one_result in dataset_output_dir.iterdir(): - eval_file = one_result / "retrieve_eval.json" - if eval_file.exists(): - with open(eval_file) as f: - dataset_results.append(json.load(f)) - - results.append( - { - **DATASET_REGISTRY[dataset_output_dir.name].model_dump(), - "results": dataset_results, - "is_closed": DATASET_REGISTRY[dataset_output_dir.name].tier != 3, - } - ) - - with open(Path(results_dir) / "results.json", "w") as f: - f.write(json.dumps(results, indent=4)) - - -def main(args: argparse.Namespace): - _dump_model_meta() - _dump_dataset_info() - - if args.gpus: - trainer = pl.Trainer( - strategy=DDPStrategy(find_unused_parameters=False), - accelerator="gpu", - devices=args.gpus, - precision="bf16" if args.bf16 else "32", - ) - else: - trainer = pl.Trainer( - strategy=DDPStrategy(), - accelerator="cpu", 
- devices=args.cpus, - ) - - if not trainer.is_global_zero: - logging.basicConfig(level=logging.ERROR) - - # Evaluate each model on the specified datasets - for model_meta in MODEL_REGISTRY.values(): - encoder = Encoder( - model_meta.load_model(), - save_embds=args.save_embds, - load_embds=args.load_embds, - ) - retriever = Retriever( - topk=args.topk, - similarity=model_meta.similarity, - save_prediction=args.save_prediction, - ) - - eval_results = {} - for dataset_meta in DATASET_REGISTRY.values(): - # if trainer.is_global_zero: - # trainer.print(f"Evaluating {model_meta.model_name} on {dataset_meta.dataset_name}") - - result = run_retrieve_task(dataset_meta, trainer, encoder, retriever, args) - eval_results[dataset_meta.dataset_name] = result - - metric = "ndcg_at_10" - - # Print the results - if trainer.is_global_zero: - trainer.print("=" * 40) - trainer.print(args.save_path) - trainer.print("=" * 40) - for task in eval_results.keys(): - if metric in eval_results[task]: - trainer.print(f"{task:<32}{eval_results[task][metric]:.4f}") - - _compile_results() - - -if __name__ == "__main__": - args = get_args() - main(args) diff --git a/mteb/rteb/ebr/core/__init__.py b/mteb/rteb/ebr/core/__init__.py deleted file mode 100644 index 9d48db4f9f..0000000000 --- a/mteb/rteb/ebr/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from __future__ import annotations diff --git a/mteb/rteb/ebr/core/base/__init__.py b/mteb/rteb/ebr/core/base/__init__.py deleted file mode 100644 index 9d48db4f9f..0000000000 --- a/mteb/rteb/ebr/core/base/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from __future__ import annotations diff --git a/mteb/rteb/ebr/core/base/dataset.py b/mteb/rteb/ebr/core/base/dataset.py deleted file mode 100644 index 1a80d96239..0000000000 --- a/mteb/rteb/ebr/core/base/dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import annotations - -from abc import ABC -from functools import cache -from pathlib import Path -from typing import TYPE_CHECKING - -from 
torch.utils.data import Dataset - -if TYPE_CHECKING: - from ebr.core.meta import DatasetMeta - - -def add_instruct(dataset: Dataset, instruct: str, input_type: str): - for item in dataset.data: - if instruct: - item["text"] = instruct + item["text"] - item["input_type"] = input_type - - return dataset - - -class RetrievalDataset(ABC): - LEADERBOARD: str = None - - def __init__( - self, - data_path: str, - dataset_meta: DatasetMeta, - query_instruct: str | None = None, - corpus_instruct: str | None = None, - **kwargs, - ): - assert type(self).LEADERBOARD, "leaderboard must be defined" - super().__init__() - self._dataset_meta = dataset_meta - self._query_instruct = query_instruct - self._corpus_instruct = corpus_instruct - self._task_path = (Path(data_path) / dataset_meta.dataset_name).resolve() - - # def __getattr__(self, name: str) -> Any: - # try: - # return super().__getattr__(name) - # except AttributeError: - # return getattr(self._dataset_meta, name) - - @property - @cache - def corpus(self) -> Dataset: - corpus = self._corpus() - corpus = add_instruct(corpus, self._corpus_instruct, "document") - return corpus - - def _corpus(self) -> Dataset: - raise NotImplementedError - - @property - @cache - def queries(self) -> Dataset: - queries = self._queries() - queries = add_instruct(queries, self._query_instruct, "query") - return queries - - def _queries(self) -> Dataset: - raise NotImplementedError - - @property - @cache - def relevance(self) -> dict: - # Dict of dict: relevance[query_id][corpus_id] = score - pass - - def prepare_data(self): - _ = self.corpus - _ = self.queries - _ = self.relevance diff --git a/mteb/rteb/ebr/core/base/model.py b/mteb/rteb/ebr/core/base/model.py deleted file mode 100644 index 327bd66396..0000000000 --- a/mteb/rteb/ebr/core/base/model.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import annotations - -import logging -import time -from abc import ABC, abstractmethod -from types import NoneType -from typing import TYPE_CHECKING, 
Any - -import torch.nn as nn - -if TYPE_CHECKING: - from mteb.model_meta import ModelMeta - - -class EmbeddingModel(nn.Module, ABC): - """Base class for embedding models.""" - - def __init__(self, model_meta: ModelMeta, **kwargs): - super().__init__() - self._model_meta = model_meta - - @abstractmethod - def embed(self, data: Any, input_type: str) -> list[list[float]]: - pass - - def forward(self, batch: dict[str, Any]) -> list[list[float]]: - return self.embed(batch["text"], batch["input_type"][0]) - - def __getattr__(self, name: str) -> Any: - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self._model_meta, name) - - -class APIEmbeddingModel(EmbeddingModel): - """Base class for API-based embedding models.""" - - def __init__( - self, - model_meta: ModelMeta, - api_key: str | None = None, - num_retries: int | None = None, - **kwargs, - ): - super().__init__(model_meta, **kwargs) - self._api_key = api_key - assert num_retries is None or num_retries > 0, ( - "num_retries must be a positive integer" - ) - self._num_retries = num_retries - - @property - @abstractmethod - def client(self) -> Any: - pass - - def forward(self, batch: dict[str, Any]) -> list[list[float]]: - num_tries = 0 - while not self._num_retries or num_tries < self._num_retries: - try: - num_tries += 1 - result = super().forward(batch) - break - except Exception as e: - logging.error(e) - if isinstance(e, type(self).rate_limit_error_type()): - time.sleep(60) - elif isinstance(e, type(self).service_error_type()): - time.sleep(300) - else: - raise e - return result - - @property - def api_key(self) -> str: - return self._api_key - - @property - def num_retries(self) -> int: - return self._num_retries if self._num_retries else float("inf") - - @staticmethod - def rate_limit_error_type() -> type: - return NoneType - - @staticmethod - def service_error_type() -> type: - return NoneType diff --git a/mteb/rteb/ebr/core/data.py b/mteb/rteb/ebr/core/data.py deleted file mode 
100644 index bfa3554782..0000000000 --- a/mteb/rteb/ebr/core/data.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import annotations - -import torch -from ebr.datasets import get_retrieval_dataset -from ebr.utils.data import EmptyDataset, JSONLDataset -from pytorch_lightning import LightningDataModule - - -class EmbeddingDataCollator: - def __call__(self, examples): - assert len(examples) > 0 - batch = { - key: [example[key] for example in examples] for key in examples[0].keys() - } - batch["embd"] = torch.tensor(batch["embd"]) - return batch - - -class RetrieveDataCollator: - def __init__(self, tokenizer=None): - self.tokenizer = tokenizer - self._early_truncate = True - - def __call__(self, examples): - assert len(examples) > 0 - batch = {} - batch["id"] = [ex["id"] for ex in examples] - batch["text"] = [ex["text"] for ex in examples] - - if self.tokenizer: - texts = [s.strip() for s in batch["text"]] - - if self._early_truncate: - max_str_len = self.tokenizer.model_max_length * 6 - texts = [s[:max_str_len] for s in texts] - - batch["input"] = self.tokenizer( - texts, - padding=True, - truncation=True, - return_tensors="pt", - ) - - return batch - - -class RetrieveDataModule(LightningDataModule): - def __init__( - self, - data_path: str, - dataset_name: str, - batch_size: int = 32, - embd_batch_size: int = 1024, - num_workers: int = 4, - dataset_kwargs: dict | None = None, - collator_kwargs: dict | None = None, - ): - super().__init__() - self.batch_size = batch_size - self.embd_batch_size = embd_batch_size - self.num_workers = num_workers - self.dataset = get_retrieval_dataset( - data_path=data_path, - dataset_name=dataset_name, - **dataset_kwargs, - ) - self.query_collator = None - self.corpus_collator = None - - def prepare_data(self): - self.dataset.prepare_data() - - def queries_dataloader(self): - return torch.utils.data.DataLoader( - self.dataset.queries, - batch_size=self.batch_size, - num_workers=self.num_workers, - collate_fn=self.query_collator, - 
) - - def corpus_dataloader(self): - return torch.utils.data.DataLoader( - self.dataset.corpus, - batch_size=self.batch_size, - num_workers=self.num_workers, - collate_fn=self.corpus_collator, - ) - - def set_queries_embds(self, queries_embds=None, queries_embds_files=None): - if queries_embds: - self.queries_embds = queries_embds - self.queries_embd_ds = EmptyDataset(queries_embds) - else: - self.queries_embd_ds = JSONLDataset(queries_embds_files) - assert len(self.queries_embd_ds) == len(self.dataset.queries) - - def set_corpus_embds(self, corpus_embds=None, corpus_embds_files=None): - if corpus_embds: - self.corpus_embds = corpus_embds - self.corpus_embd_ds = EmptyDataset(corpus_embds) - else: - self.corpus_embd_ds = JSONLDataset(corpus_embds_files) - # TODO: check this assertion later, removed for chunk model - # assert len(self.corpus_embd_ds) == len(self.dataset.corpus) - - def queries_embd_dataloader(self): - return torch.utils.data.DataLoader( - self.queries_embd_ds, - batch_size=self.embd_batch_size, - num_workers=self.num_workers, - collate_fn=EmbeddingDataCollator(), - ) - - def corpus_embd_dataloader(self): - return torch.utils.data.DataLoader( - self.corpus_embd_ds, - batch_size=self.embd_batch_size, - num_workers=self.num_workers, - collate_fn=EmbeddingDataCollator(), - ) diff --git a/mteb/rteb/ebr/core/encoder.py b/mteb/rteb/ebr/core/encoder.py deleted file mode 100644 index 66eacf7367..0000000000 --- a/mteb/rteb/ebr/core/encoder.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import annotations - -import json -import logging -import os - -from ebr.core.base import EmbeddingModel -from ebr.utils.data import JSONLDataset -from ebr.utils.distributed import gather_list -from pytorch_lightning import LightningModule - -logger = logging.getLogger(__name__) - - -class Encoder(LightningModule): - def __init__( - self, - model: EmbeddingModel, - save_embds: bool = False, - load_embds: bool = False, - **kwargs, - ): - super().__init__() - self._model = 
model - self._load_embds = load_embds - self._save_embds = save_embds - # Keep the embeddings in memory by default. Set it to False for large corpus. - self.in_memory = True - self.is_query = False - self.save_file = None - - @property - def model(self) -> EmbeddingModel: - return self._model - - @property - def load_embds(self) -> bool: - return self._load_embds - - @property - def save_embds(self) -> bool: - # If in_memory=False, we have to save the embeddings - return self._save_embds or not self.in_memory - - @property - def local_embd_file_name(self) -> str: - assert self.save_file is not None - num_shards = self.trainer.num_devices - return f"{self.save_file}-{self.local_rank}-of-{num_shards}" - - def get_local_embd_files(self, num_shards=None) -> list[str]: - # Return local (intermediate) file names, which are jsonl files - assert self.save_file is not None - if num_shards is None: - num_shards = self.trainer.num_devices - return [f"{self.save_file}-{i}-of-{num_shards}" for i in range(num_shards)] - - def get_embd_files(self, num_shards=None) -> list[str]: - # Return the final file names, which are arrow files - local_files = self.get_local_embd_files(num_shards=num_shards) - return local_files - - def embd_files_exist(self, num_shards=None) -> bool: - files = self.get_embd_files(num_shards=num_shards) - return all(os.path.exists(file) for file in files) - - def on_predict_epoch_start(self): - self.embds = None - - if self.in_memory: - self.local_embds = [] - - if self.load_embds: - self.local_existing_ids = set() - if os.path.exists(self.local_embd_file_name): - logger.warning(f"Load embeddings from {self.local_embd_file_name}") - ds = JSONLDataset(self.local_embd_file_name) - for example in ds: - self.local_existing_ids.add(example["id"]) - if self.in_memory: - self.local_embds.append(example) - else: - logger.warning( - f"load_embds is True but {self.local_embd_file_name} doesn't exist. Skipping the loading." 
- ) - - if self.save_embds: - if self.load_embds: - # append to the file - self.local_embd_file = open(self.local_embd_file_name, "a") - else: - # rewrite the file - self.local_embd_file = open(self.local_embd_file_name, "w") - - def predict_step(self, batch, batch_idx): - indices = batch["id"] - - if self.load_embds and self.local_existing_ids: - masks = [id in self.local_existing_ids for id in indices] - num_existed = sum(masks) - if num_existed == len(indices): - return - elif num_existed > 0: - raise NotImplementedError( - "Partial loading within batch is not supported yet." - ) - - embds = self._model(batch) - - for idx, embd in zip(indices, embds): - obj = {"id": idx, "embd": embd} - if self.in_memory: - self.local_embds.append(obj) - if self.save_embds: - self.local_embd_file.write(json.dumps(obj) + "\n") - - def on_predict_epoch_end(self): - if self.save_embds: - self.local_embd_file.close() - if self.in_memory: - self.embds = gather_list(self.local_embds, self.trainer.num_devices) - self.trainer.strategy.barrier() diff --git a/mteb/rteb/ebr/core/meta.py b/mteb/rteb/ebr/core/meta.py deleted file mode 100644 index 8ad551aed8..0000000000 --- a/mteb/rteb/ebr/core/meta.py +++ /dev/null @@ -1,103 +0,0 @@ -from __future__ import annotations - -from typing import Any, Callable, Literal - -from ebr.core.base import EmbeddingModel, RetrievalDataset -from pydantic import BaseModel, ConfigDict - -# Tier 0: fully open (documents, queries, relevance) -# Tier 1: documents and queries released -# Tier 2: documents released -# Tier 3: fully held out -DATASET_TIER = Literal[0, 1, 2, 3] - -EMBEDDING_DTYPES = Literal["float32", "int8", "binary"] -SIMILARITY_METRICS = Literal["cosine", "dot"] - - -def dataset_id(dataset_name: str) -> str: - return f"{dataset_name}" - - -def model_id( - model_name: str, - embd_dtype: str, - embd_dim: int, -) -> str: - return f"{model_name.replace('/', '__')}_{embd_dtype}_{embd_dim}d" - - -class DatasetMeta(BaseModel): - """Dataset metadata 
object. - - Attributes: - TODO - """ - - model_config: ConfigDict = ConfigDict(protected_namespaces=()) - - loader: Callable[..., RetrievalDataset] - dataset_name: str - tier: DATASET_TIER = 3 - groups: dict[str, int] = {} - reference: str | None = None - - def model_dump(self, **kwargs) -> dict[str, Any]: - exclude = kwargs.pop("exclude", set()) | {"loader"} - return super().model_dump(exclude=exclude, **kwargs) - - def model_dump_json(self, **kwargs) -> dict[str, Any]: - exclude = kwargs.pop("exclude", set()) | {"loader"} - return super().model_dump_json(exclude=exclude, **kwargs) - - def load_dataset(self, data_path: str, **kwargs): - return self.loader(data_path, self, **kwargs) - - @property - def _id(self) -> str: - return dataset_id(self.dataset_name) - - -class ModelMeta(BaseModel): - """Model metadata object. Adapted from embeddings-benchmark/mteb/model_meta.py. - - Attributes: - loader: the function that loads the model. - name: The name of the model. - embd_dtype: The data type of the embeddings produced by the model, e.g. `float32`. - embd_dim: The dimension of the embeddings produced by the model, e.g. `1024`. - num_params: The number of parameters in the model, e.g. `7_000_000` for a 7M parameter model. - max_tokens: The maximum number of tokens the model can handle. - similarity: Similarity function, e.g. cosine, dot-product, etc. - query_instruct: Prompt to prepend to the input for queries. - corpus_instruct: Prompt to prepend to the input for documents. 
- """ - - model_config: ConfigDict = ConfigDict(protected_namespaces=()) - - loader: Callable[..., EmbeddingModel] - model_name: str - embd_dtype: EMBEDDING_DTYPES | None = None - embd_dim: int | None = None - num_params: int | None = None - max_tokens: int | None = None - similarity: SIMILARITY_METRICS | None = None - query_instruct: str | None = None - corpus_instruct: str | None = None - reference: str | None = None - alias: str | None = None - - def model_dump(self, **kwargs) -> dict[str, Any]: - exclude = kwargs.pop("exclude", set()) | {"loader"} - return super().model_dump(exclude=exclude, **kwargs) - - def model_dump_json(self, **kwargs) -> dict[str, Any]: - exclude = kwargs.pop("exclude", set()) | {"loader"} - return super().model_dump_json(exclude=exclude, **kwargs) - - def load_model(self, **kwargs) -> EmbeddingModel: - return self.loader(self, **kwargs) - - @property - def _id(self) -> str: - return model_id(self.model_name, self.embd_dtype, self.embd_dim) diff --git a/mteb/rteb/ebr/core/retriever.py b/mteb/rteb/ebr/core/retriever.py deleted file mode 100644 index ee502840f5..0000000000 --- a/mteb/rteb/ebr/core/retriever.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -import json -from collections import OrderedDict - -import torch -import torch.distributed as dist -from beir.retrieval.search.dense.util import cos_sim, dot_score -from pytorch_lightning import LightningModule - - -class Retriever(LightningModule): - def __init__( - self, - topk: int = 100, - similarity: str = "cosine", - save_prediction: bool = False, - ): - super().__init__() - self.topk = topk - if similarity == "cosine": - self.similarity_fn = cos_sim - self.largest = True - elif similarity == "dot": - self.similarity_fn = dot_score - self.largest = True - elif similarity == "euclidean": - self.similarity_fn = torch.cdist - self.largest = False - else: - raise ValueError(f"similarity {similarity} is invalid.") - self.in_memory = True - self.save_file = None - 
self.save_prediction = save_prediction - - @property - def local_prediction_file_name(self): - assert self.save_file is not None - num_shards = self.trainer.num_devices - return f"{self.save_file}-{self.local_rank}-of-{num_shards}" - - def get_local_prediction_files(self, num_shards=None): - assert self.save_file is not None - if num_shards is None: - num_shards = self.trainer.num_devices - return [f"{self.save_file}-{i}-of-{num_shards}" for i in range(num_shards)] - - def on_predict_epoch_start(self): - self.local_prediction = {} - - def predict_step(self, batch, batch_idx): - query_ids, query_embds = batch["id"], batch["embd"].float() - if isinstance(query_ids, torch.Tensor): - # TODO: change dataloader to support int id - raise NotImplementedError("id must be a string.") - corpus_ids = [] - batch_scores = [] - # Compute the similarity in batches - for corpus_batch in self.corpus_embd_dataloader: - corpus_ids += corpus_batch["id"] - corpus_embds = corpus_batch["embd"].float().to(query_embds.device) - scores = self.similarity_fn(query_embds, corpus_embds).cpu() - batch_scores.append(scores) - # Concat the scores and compute top-k - scores = torch.cat(batch_scores, dim=1) - if not self.largest: - scores = scores * -1 - topk = min(self.topk, len(corpus_ids)) - topk_scores, topk_ids = torch.topk(scores, topk, dim=1, largest=True) - topk_scores, topk_ids = topk_scores.tolist(), topk_ids.tolist() - for i, qid in enumerate(query_ids): - result = OrderedDict() - for j in range(topk): - cid = corpus_ids[topk_ids[i][j]] - result[cid] = topk_scores[i][j] - self.local_prediction[qid] = result - - def on_predict_epoch_end(self): - if self.trainer.num_devices > 1: - if self.in_memory: - gathered_prediction = [None] * self.trainer.num_devices - dist.all_gather_object(gathered_prediction, self.local_prediction) - self.prediction = { - k: v for preds in gathered_prediction for k, v in preds.items() - } - else: - with open(self.local_prediction_file_name, "w") as f: - 
json.dump(self.local_prediction, f) - self.trainer.strategy.barrier() - self.prediction = {} - if self.trainer.is_global_zero: - for file in self.get_local_prediction_files(): - with open(file) as f: - self.prediction.update(json.load(f)) - else: - self.prediction = self.local_prediction - - if self.save_prediction and self.trainer.is_global_zero: - assert self.save_file is not None - with open(self.save_file, "w") as f: - json.dump(self.prediction, f) diff --git a/mteb/rteb/ebr/datasets/__init__.py b/mteb/rteb/ebr/datasets/__init__.py deleted file mode 100644 index 8c00e69188..0000000000 --- a/mteb/rteb/ebr/datasets/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from __future__ import annotations - -from ebr.core.base import RetrievalDataset -from ebr.core.meta import DatasetMeta, dataset_id -from ebr.datasets.text import * -from ebr.utils.lazy_import import LazyImport - -DATASET_REGISTRY: dict[str, DatasetMeta] = {} -for name in dir(): - meta = eval(name) - # Explicitly exclude `LazyImport` instances since the latter check invokes the import. 
- if not isinstance(meta, LazyImport) and isinstance(meta, DatasetMeta): - DATASET_REGISTRY[meta._id] = eval(name) - - -def get_retrieval_dataset( - data_path: str, dataset_name: str, **kwargs -) -> RetrievalDataset: - key = dataset_id(dataset_name) - return DATASET_REGISTRY[key].load_dataset(data_path=data_path, **kwargs) diff --git a/mteb/rteb/ebr/datasets/text.py b/mteb/rteb/ebr/datasets/text.py deleted file mode 100644 index 85aab5be2e..0000000000 --- a/mteb/rteb/ebr/datasets/text.py +++ /dev/null @@ -1,214 +0,0 @@ -from __future__ import annotations - -import json -import os -from functools import cache - -from ebr.core.base import RetrievalDataset -from ebr.core.meta import DatasetMeta -from ebr.utils.data import JSONLDataset -from torch.utils.data import Dataset - - -class TextRetrievalDataset(RetrievalDataset): - LEADERBOARD: str = "Text" - - def __init__( - self, - data_path: str, - dataset_meta: DatasetMeta, - query_instruct: str | None = None, - corpus_instruct: str | None = None, - **kwargs, - ): - super().__init__( - data_path, - dataset_meta, - query_instruct=query_instruct, - corpus_instruct=corpus_instruct, - **kwargs, - ) - assert os.path.isdir(self._task_path), f"{self._task_path} is not a directory." - - @property - def corpus_file(self) -> str: - for name in ["corpus.jsonl", "corpus.arrow"]: - file = os.path.join(self._task_path, name) - if os.path.exists(file): - return file - raise FileNotFoundError( - f"Corpus file (corpus.{{jsonl/arrow}}) does not exist under {self._task_path}." - ) - - @cache - def _corpus(self) -> Dataset: - return JSONLDataset(self.corpus_file) - - @property - def queries_file(self) -> str: - for name in ["queries.jsonl", "queries.arrow"]: - file = os.path.join(self._task_path, name) - if os.path.exists(file): - return file - raise FileNotFoundError( - f"Queries file (queries.{{jsonl/arrow}}) does not exist under {self._task_path}." 
- ) - - @cache - def _queries(self) -> Dataset: - return JSONLDataset(self.queries_file) - - @property - def relevance_file(self) -> str: - for name in ["relevance.json", "relevance.jsonl"]: - file = os.path.join(self._task_path, name) - if os.path.exists(file): - return file - raise FileNotFoundError( - f"Relevance file (relevance.{{json/jsonl}}) does not exist under {self._task_path}." - ) - - @property - @cache - def relevance(self) -> dict: - relevant_docs = {} - try: - print(self.relevance_file) - with open(self.relevance_file) as f: - for line in f: - data = json.loads(line) - for key, value in data.items(): - if key not in relevant_docs: - relevant_docs[key] = value - else: - relevant_docs[key].update(value) - except FileNotFoundError: - return {} - return relevant_docs - - -# Legal datasets - -AILACasedocs = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="AILACasedocs", - tier=3, - groups={"text": 1, "legal": 1, "english": 1}, - reference=None, -) - -AILAStatutes = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="AILAStatutes", - tier=3, - groups={"text": 1, "legal": 1, "english": 1}, - reference=None, -) - -LegalSummarization = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="LegalSummarization", - tier=3, - groups={"text": 1, "legal": 1, "english": 1}, - reference=None, -) - -LegalQuAD = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="LegalQuAD", - tier=3, - groups={"text": 1, "legal": 1, "german": 1}, - reference=None, -) - - -# Finance datasets - -FinanceBench = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="FinanceBench", - tier=3, - groups={"text": 1, "finance": 1, "english": 1}, - reference=None, -) - -HC3Finance = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="HC3Finance", - tier=3, - groups={"text": 1, "finance": 1, "english": 1}, - reference=None, -) - -FinQA = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="FinQA", - tier=3, - groups={"text": 1, "finance": 1, 
"english": 1}, - reference=None, -) - - -# Code datasets - -APPS = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="APPS", - tier=3, - groups={"text": 1, "code": 1, "english": 1}, - reference=None, -) - -DS1000 = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="DS1000", - tier=3, - groups={"text": 1, "code": 1, "english": 1}, - reference=None, -) - -HumanEval = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="HumanEval", - tier=3, - groups={"text": 1, "code": 1}, - reference=None, -) - -MBPP = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="MBPP", - tier=3, - groups={"text": 1, "code": 1}, - reference=None, -) - -WikiSQL = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="WikiSQL", - tier=3, - groups={"text": 1, "code": 1, "english": 1}, - reference=None, -) - - -# Healthcare datasets - -ChatDoctor_HealthCareMagic = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="ChatDoctor_HealthCareMagic", - tier=3, - groups={"text": 1, "healthcare": 1, "english": 1}, - reference=None, -) - - -# Other/multilingual datasets - -FrenchBoolQ = DatasetMeta( - loader=TextRetrievalDataset, - dataset_name="FrenchBoolQ", - tier=3, - groups={"text": 1, "french": 1}, - reference=None, -) diff --git a/mteb/rteb/ebr/models/__init__.py b/mteb/rteb/ebr/models/__init__.py deleted file mode 100755 index 46426b51eb..0000000000 --- a/mteb/rteb/ebr/models/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import annotations - -from ebr.core.base import EmbeddingModel -from ebr.models.bgem3 import * -from ebr.models.cohere import * -from ebr.models.google import * -from ebr.models.gritlm import * -from ebr.models.openai import * -from ebr.models.sentence_transformers import * -from ebr.models.voyageai import * -from ebr.utils.lazy_import import LazyImport - -from mteb.model_meta import ModelMeta, model_id - -MODEL_REGISTRY: dict[str, ModelMeta] = {} -for name in dir(): - meta = eval(name) - # Explicitly exclude `LazyImport` 
instances since the latter check invokes the import. - if not isinstance(meta, LazyImport) and isinstance(meta, ModelMeta): - MODEL_REGISTRY[meta._id] = eval(name) - - -def get_embedding_model( - model_name: str, embd_dim: int, embd_dtype: str, **kwargs -) -> EmbeddingModel: - key = model_id(model_name, embd_dim, embd_dtype) - # TODO: add logic to dynamically load missing model - return MODEL_REGISTRY[key].load_model(**kwargs) diff --git a/mteb/rteb/ebr/models/bgem3.py b/mteb/rteb/ebr/models/bgem3.py deleted file mode 100644 index 960cc4cd19..0000000000 --- a/mteb/rteb/ebr/models/bgem3.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import annotations - -import os - -from ebr.core.base import EmbeddingModel -from ebr.utils.lazy_import import LazyImport - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -BGEM3FlagModel = LazyImport("FlagEmbedding", attribute="BGEM3FlagModel") - - -class BGEM3EmbeddingModel(EmbeddingModel): - def __init__(self, model_meta: ModelMeta, **kwargs): - super().__init__(model_meta, **kwargs) - self._model = BGEM3FlagModel( - model_name_or_path=model_meta.model_name, - ) - - def embed(self, data: list[str], input_type: str) -> list[list[float]]: - result = self._model.encode(sentences=data, batch_size=12)["dense_vecs"] - return [[float(str(x)) for x in result[i]] for i in range(len(result))] - - -bge_m3 = ModelMeta( - loader=BGEM3EmbeddingModel, - model_name="BAAI/bge-m3", - embd_dtype="float32", - embd_dim=1024, - max_tokens=8192, - similarity="cosine", - reference="https://huggingface.co/BAAI/bge-m3", -) -# -# bge_m3_unsupervised = ModelMeta( -# loader=BGEM3EmbeddingModel, -# model_name='BAAI/bge-m3-unsupervised', -# embd_dtype="float32", -# embd_dim=1024, -# max_tokens=8192, -# similarity="cosine", -# reference="https://huggingface.co/BAAI/bge-m3-unsupervised" -# ) -# -# bge_m3_retromae = ModelMeta( -# loader=BGEM3EmbeddingModel, -# model_name='BAAI/bge-m3-retromae', -# 
embd_dtype="float32", -# max_tokens=8192, -# similarity="cosine", -# reference="https://huggingface.co/BAAI/bge-m3-retromae" -# ) -# -# bge_large_en_v15 = ModelMeta( -# loader=BGEM3EmbeddingModel, -# model_name='BAAI/bge-large-en-v1.5', -# embd_dtype="float32", -# embd_dim=1024, -# max_tokens=512, -# similarity="cosine", -# reference="https://huggingface.co/BAAI/bge-large-en-v1.5" -# ) -# -# bge_base_en_v15 = ModelMeta( -# loader=BGEM3EmbeddingModel, -# model_name='BAAI/bge-base-en-v1.5', -# embd_dtype="float32", -# embd_dim=768, -# max_tokens=512, -# similarity="cosine", -# reference="https://huggingface.co/BAAI/bge-base-en-v1.5" -# ) -# -# bge_small_en_v15 = ModelMeta( -# loader=BGEM3EmbeddingModel, -# model_name='BAAI/bge-small-en-v1.5', -# embd_dtype="float32", -# embd_dim=384, -# max_tokens=512, -# similarity="cosine", -# reference="https://huggingface.co/BAAI/bge-small-en-v1.5" -# ) diff --git a/mteb/rteb/ebr/models/cohere.py b/mteb/rteb/ebr/models/cohere.py deleted file mode 100644 index 05e8351225..0000000000 --- a/mteb/rteb/ebr/models/cohere.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -import os -from typing import TYPE_CHECKING - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -from ebr.core.base import APIEmbeddingModel -from ebr.utils.lazy_import import LazyImport - -if TYPE_CHECKING: - import cohere -else: - cohere = LazyImport("cohere") - - -class CohereEmbeddingModel(APIEmbeddingModel): - def __init__( - self, - model_meta: ModelMeta, - api_key: str | None = None, - num_retries: int | None = None, - **kwargs, - ): - super().__init__(model_meta, api_key=api_key, num_retries=num_retries, **kwargs) - self._client = None - - @property - def client(self) -> cohere.ClientV2: - if not self._client: - self._client = cohere.ClientV2(api_key=self._api_key) - return self._client - - @property - def embedding_type(self) -> str: - if self.embd_dtype == "float32": - 
return "float" - else: - raise NotImplementedError - - def embed(self, data: str, input_type: str) -> list[list[float]]: - return getattr( - self.client.embed( - model=self.model_name, - texts=data, - input_type="search_query" - if input_type == "query" - else "search_document", - embedding_types=[self.embedding_type], - ).embeddings, - self.embedding_type, - ) - - @staticmethod - def rate_limit_error_type() -> type: - return cohere.errors.too_many_requests_error.TooManyRequestsError - - -""" -embed_multilingual_v3_0 = ModelMeta( - loader=CohereEmbeddingModel, - model_name="embed-multilingual-v3.0", - embd_dtype="float32", - embd_dim=1024, - max_tokens=512, - similarity="cosine", - reference="https://docs.cohere.com/v2/docs/cohere-embed" -) -""" diff --git a/mteb/rteb/ebr/models/google.py b/mteb/rteb/ebr/models/google.py deleted file mode 100644 index e916899f4d..0000000000 --- a/mteb/rteb/ebr/models/google.py +++ /dev/null @@ -1,82 +0,0 @@ -from __future__ import annotations - -import logging -import os -import time -from typing import Any - -from ebr.core.base import APIEmbeddingModel - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -from google import genai -from google.genai.errors import APIError -from google.genai.types import EmbedContentConfig - - -class GoogleEmbeddingModel(APIEmbeddingModel): - def __init__( - self, - model_meta: ModelMeta, - api_key: str | None = None, - num_retries: int | None = None, - **kwargs, - ): - super().__init__(model_meta, api_key=api_key, num_retries=num_retries, **kwargs) - self._client = None - - @property - def client(self) -> genai.Client: - if not self._client: - print("Initializing the client") - self._client = genai.Client(api_key=self._api_key) - return self._client - - def embed(self, data: Any, input_type: str) -> list[list[float]]: - response = self.client.models.embed_content( - model=self._model_meta.model_name, - contents=data, - 
config=EmbedContentConfig( - task_type="RETRIEVAL_QUERY" - if input_type == "query" - else "RETRIEVAL_DOCUMENT", - output_dimensionality=self.embd_dim, - ), - ) - return [embedding.values for embedding in response.embeddings] - - def forward(self, batch: dict[str, Any]) -> list[list[float]]: - num_tries = 0 - while not self._num_retries or num_tries < self._num_retries: - try: - num_tries += 1 - result = self.embed(batch["text"], batch["input_type"][0]) - return result - except Exception as e: - logging.error(e) - if isinstance(e, APIError): - if e.code == 429: - print("RLE") - time.sleep(60) - elif e.code >= 500: - print("Other error") - time.sleep(300) - else: - raise e - else: - raise e - raise Exception(f"Calling the API failed {num_tries} times") - - -text_embedding_004 = ModelMeta( - loader=GoogleEmbeddingModel, - model_name="text-embedding-004", - embd_dtype="float32", - embd_dim=768, - max_tokens=2048, - similarity="cosine", - reference="https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings", -) diff --git a/mteb/rteb/ebr/models/gritlm.py b/mteb/rteb/ebr/models/gritlm.py deleted file mode 100644 index ed38f28017..0000000000 --- a/mteb/rteb/ebr/models/gritlm.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -import os - -from ebr.core.base import EmbeddingModel -from ebr.utils.lazy_import import LazyImport - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -GritLM = LazyImport("gritlm", attribute="GritLM") - - -class GRITLMEmbeddingModel(EmbeddingModel): - def __init__(self, model_meta: ModelMeta, **kwargs): - super().__init__(model_meta, **kwargs) - self._model = GritLM( - model_name_or_path="GritLM/GritLM-7B", - normalized=False, - torch_dtype=model_meta.embd_dtype, - mode="embedding", - ) - - def embed(self, data: list[str], input_type: str) -> list[list[float]]: - result = self._model.encode(sentences=data) - return [[float(str(x)) for x 
in result[i]] for i in range(len(result))] - - -""" -gritlm_7b = ModelMeta( - loader=GRITLMEmbeddingModel, - model_name="GritLM/GritLM-7B", - embd_dtype="float32", - embd_dim=384, - num_params=7_240_000, - similarity="cosine", - reference="https://huggingface.co/GritLM/GritLM-7B" -) - -gritlm_8x7b = ModelMeta( - loader=GRITLMEmbeddingModel, - model_name="GritLM/GritLM-8x7B", - embd_dtype="float32", - embd_dim=384, - num_params=46_700_000, - similarity="cosine", - reference="https://huggingface.co/GritLM/GritLM-8x7B" -) -""" diff --git a/mteb/rteb/ebr/models/openai.py b/mteb/rteb/ebr/models/openai.py deleted file mode 100644 index a242656096..0000000000 --- a/mteb/rteb/ebr/models/openai.py +++ /dev/null @@ -1,108 +0,0 @@ -from __future__ import annotations - -import os -from typing import TYPE_CHECKING - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -from ebr.core.base import APIEmbeddingModel -from ebr.utils.lazy_import import LazyImport - -if TYPE_CHECKING: - import openai - import tiktoken -else: - openai = LazyImport("openai") - tiktoken = LazyImport("tiktoken") - - -class OpenAIEmbeddingModel(APIEmbeddingModel): - def __init__( - self, - model_meta: ModelMeta, - api_key: str | None = None, - num_retries: int | None = None, - **kwargs, - ): - super().__init__(model_meta, api_key=api_key, num_retries=num_retries, **kwargs) - self._client = None - self._tokenizer = None - - @property - def client(self) -> openai.OpenAI: - if not self._client: - self._client = openai.OpenAI(api_key=self._api_key) - return self._client - - @property - def tokenizer(self): - if not self._tokenizer: - self._tokenizer = tiktoken.get_encoding("cl100k_base") - return self._tokenizer - - def embed(self, data: str, input_type: str) -> list[list[float]]: - tokens = [self.tokenizer.encode(text, disallowed_special=()) for text in data] - if self.max_tokens: - for n, tok in enumerate(tokens): - if len(tok) > self.max_tokens: 
- tokens[n] = tok[: self.max_tokens] - result = self.client.embeddings.create( - input=tokens, model=self.model_name, dimensions=self.embd_dim - ) - embeddings = [d.embedding for d in result.data] - return embeddings - - @staticmethod - def rate_limit_error_type() -> type: - return openai.RateLimitError - - @staticmethod - def service_error_type() -> type: - return openai.InternalServerError - - -text_embedding_3_large = ModelMeta( - loader=OpenAIEmbeddingModel, - model_name="text-embedding-3-large", - embd_dtype="float32", - embd_dim=3072, - max_tokens=8191, - similarity="cosine", - reference="https://platform.openai.com/docs/guides/embeddings", -) - - -text_embedding_3_large_512d = ModelMeta( - loader=OpenAIEmbeddingModel, - model_name="text-embedding-3-large", - embd_dtype="float32", - embd_dim=512, - max_tokens=8191, - similarity="cosine", - reference="https://platform.openai.com/docs/guides/embeddings", -) - - -text_embedding_3_small = ModelMeta( - loader=OpenAIEmbeddingModel, - model_name="text-embedding-3-small", - embd_dtype="float32", - embd_dim=1536, - max_tokens=8191, - similarity="cosine", - reference="https://platform.openai.com/docs/guides/embeddings", -) - - -text_embedding_3_small_512d = ModelMeta( - loader=OpenAIEmbeddingModel, - model_name="text-embedding-3-small", - embd_dtype="float32", - embd_dim=512, - max_tokens=8191, - similarity="cosine", - reference="https://platform.openai.com/docs/guides/embeddings", -) diff --git a/mteb/rteb/ebr/models/sentence_transformers.py b/mteb/rteb/ebr/models/sentence_transformers.py deleted file mode 100644 index c29193d4bf..0000000000 --- a/mteb/rteb/ebr/models/sentence_transformers.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import annotations - -import os - -from ebr.core.base import EmbeddingModel -from ebr.utils.lazy_import import LazyImport - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta - -SentenceTransformer = LazyImport( - 
"sentence_transformers", attribute="SentenceTransformer" -) - - -class SentenceTransformersEmbeddingModel(EmbeddingModel): - def __init__(self, model_meta: ModelMeta, **kwargs): - super().__init__(model_meta, **kwargs) - self._model = SentenceTransformer( - f"{self.model_name_prefix}/{self.model_name}", trust_remote_code=True - ) - - def embed(self, data: str, input_type: str) -> list[list[float]]: - return self._model.encode(data) - - @property - def model_name_prefix(self) -> str: - return "sentence-transformers" - - @property - def _id(self) -> str: - return f"{self.model_name_prefix}__{self._model_meta._id}" - - -class E5EmbeddingModel(SentenceTransformersEmbeddingModel): - @property - def model_name_prefix(self) -> str: - return "intfloat" - - -all_MiniLM_L6_v2 = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="all-MiniLM-L6-v2", - embd_dtype="float32", - embd_dim=384, - num_params=22_700_000, - max_tokens=256, - similarity="cosine", - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2", -) - -# e5_mistral_7b_instruct = ModelMeta( -# loader=SentenceTransformersEmbeddingModel, -# model_name="e5-mistral-7b-instruct", -# embd_dtype="float32", -# embd_dim=4096, -# similarity="cosine", -# reference="https://huggingface.co/intfloat/e5-mistral-7b-instruct" -# ) - -""" -all_MiniLM_L12_v2 = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="sentence-transformers/all-MiniLM-L12-v2", - embd_dtype="float32", - embd_dim=384, - num_params=33_400_000, - max_tokens=256, - similarity="cosine", - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2" -) - - -labse = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="sentence-transformers/LaBSE", - embd_dtype="float32", - embd_dim=768, - num_params=471_000_000, - max_tokens=512, - similarity="cosine", - reference="https://huggingface.co/sentence-transformers/LaBSE" -) - - -multi_qa_MiniLM_L6_cos_v1 = ModelMeta( - 
loader=SentenceTransformersEmbeddingModel, - model_name="sentence-transformer/multi-qa-MiniLM-L6-cos-v1", - embd_dtype="float32", - embd_dim=384, - num_params=22_700_000, - max_tokens=512, - similarity="cosine", - reference="https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1" -) - - -all_mpnet_base_v2 = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="sentence-transformers/all-mpnet-base-v2", - embd_dtype="float32", - embd_dim=768, - num_params=109_000_000, - max_tokens=384, - similarity="cosine", - reference="https://huggingface.co/sentence-transformers/all-mpnet-base-v2" -) - - -jina_embeddings_v2_base_en = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="jinaai/jina-embeddings-v2-base-en", - embd_dtype="float32", - embd_dim=768, - num_params=137_000_000, - max_tokens=8192, - similarity="cosine", - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en" -) - - -jina_embeddings_v2_small_en = ModelMeta( - loader=SentenceTransformersEmbeddingModel, - model_name="jinaai/jina-embeddings-v2-small-en", - embd_dtype="float32", - embd_dim=512, - num_params=32_700_000, - max_tokens=8192, - similarity="cosine", - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en" -) -""" diff --git a/mteb/rteb/ebr/models/voyageai.py b/mteb/rteb/ebr/models/voyageai.py deleted file mode 100644 index 7eabf10094..0000000000 --- a/mteb/rteb/ebr/models/voyageai.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -import os -from typing import TYPE_CHECKING, Any - -from ebr.core.base import APIEmbeddingModel - -if os.environ["USE_RTEB"]: - from ebr.core.meta import ModelMeta -else: - from mteb.model_meta import ModelMeta -from ebr.utils.lazy_import import LazyImport - -if TYPE_CHECKING: - import voyageai -else: - voyageai = LazyImport("voyageai") - - -class VoyageAIEmbeddingModel(APIEmbeddingModel): - def __init__( - self, - model_meta: ModelMeta, - api_key: str | None = None, - 
num_retries: int | None = None, - **kwargs, - ): - super().__init__(model_meta, api_key=api_key, num_retries=num_retries, **kwargs) - self._client = None - - @property - def client(self) -> voyageai.Client: - if not self._client: - self._client = voyageai.Client(api_key=self._api_key) - return self._client - - def embed(self, data: Any, input_type: str) -> list[list[float]]: - result = self.client.embed( - data, model=self.model_name, output_dimension=self.embd_dim, input_type=None - ) - return result.embeddings - - @staticmethod - def rate_limit_error_type() -> type: - return voyageai.error.RateLimitError - - @staticmethod - def service_error_type() -> type: - return voyageai.error.ServiceUnavailableError - - -voyage_3 = ModelMeta( - loader=VoyageAIEmbeddingModel, - model_name="voyage-3", - embd_dtype="float32", - embd_dim=1024, - max_tokens=32_000, - similarity="cosine", - query_instruct="Represent the query for retrieving supporting documents: ", - corpus_instruct="Represent the document for retrieval: ", - reference="https://docs.voyageai.com/docs/embeddings", -) diff --git a/mteb/rteb/ebr/retrieve.py b/mteb/rteb/ebr/retrieve.py deleted file mode 100644 index 7c9549dcfe..0000000000 --- a/mteb/rteb/ebr/retrieve.py +++ /dev/null @@ -1,177 +0,0 @@ -from __future__ import annotations - -import argparse -import json -import os -from pathlib import Path - -import pytorch_lightning as pl -from beir.retrieval.evaluation import EvaluateRetrieval -from ebr.core import Encoder -from ebr.core.data import RetrieveDataModule -from ebr.core.meta import DatasetMeta -from termcolor import colored - -CORPUS_EMBD_FILENAME = "corpus_embds.jsonl" -QUERIES_EMBD_FILENAME = "queries_embds.jsonl" -RETRIEVE_EVAL_FILENAME = "retrieve_eval.json" -RETRIEVE_PRED_FILENAME = "retrieve_pred.json" - - -def run_retrieve_evaluation(relevance, prediction): - if len(relevance) != len(prediction): - raise RuntimeError("Prediction and ground truth have different sizes.") - - ndcg, _map, recall, 
precision = EvaluateRetrieval.evaluate( - relevance, - prediction, - k_values=[1, 3, 5, 10, 20, 50, 100], - ignore_identical_ids=False, - ) - scores = { - **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, - **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, - **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, - **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, - } - return scores - - -def run_retrieve_task( - dataset_meta: DatasetMeta, - trainer: pl.Trainer, - encoder: Encoder, - retriever: pl.LightningModule, - args: argparse.Namespace, -): - dataset_name = dataset_meta.dataset_name - - task_save_path = Path(args.save_path) / dataset_name / encoder.model._id - task_save_path.mkdir(parents=True, exist_ok=True) - - if not args.overwrite: - eval_file = task_save_path / RETRIEVE_EVAL_FILENAME - pred_file = task_save_path / RETRIEVE_PRED_FILENAME - if eval_file.exists(): - with open(str(eval_file)) as f: - scores = json.load(f) - return scores - else: - if pred_file.exists(): - return - - # DataModule manages the datasets - dataset_kwargs = { - "query_instruct": encoder.model.query_instruct, - "corpus_instruct": encoder.model.corpus_instruct, - } - collator_kwargs = {} - - dm = RetrieveDataModule( - data_path=args.data_path, - dataset_name=dataset_name, - batch_size=args.batch_size, - embd_batch_size=args.embd_batch_size, - num_workers=args.num_workers, - dataset_kwargs=dataset_kwargs, - collator_kwargs=collator_kwargs, - ) - if trainer.is_global_zero: - dm.prepare_data() - trainer.print("Queries size:", len(dm.dataset.queries)) - trainer.print("Corpus size:", len(dm.dataset.corpus)) - - trainer.strategy.barrier() - - if ( - len(dm.dataset.queries) < trainer.num_devices - or len(dm.dataset.corpus) < trainer.num_devices - ): - trainer.print( - colored("Skipping the task due to too few queries / documents.", "red") - ) - return {} - - if len(dm.dataset.queries) >= 1e6: - trainer.print(colored("Skipping 
the task due to too many queries.", "red")) - return {} - - if dataset_name == "bm25": - # Build the index from corpus - retriever.build_index(dm.dataset.corpus) - # Compute the scores for queries - retriever.save_file = os.path.join(task_save_path, RETRIEVE_PRED_FILENAME) - trainer.predict(model=retriever, dataloaders=dm.queries_dataloader()) - - else: - # Compute the query embeddings - trainer.print(colored("Encode queries", "yellow")) - encoder.is_query = True - encoder.in_memory = len(dm.dataset.queries) < args.embd_in_memory_threshold - encoder.save_file = os.path.join(task_save_path, QUERIES_EMBD_FILENAME) - if args.load_embds and encoder.embd_files_exist(trainer.num_devices): - queries_embds_files = encoder.get_embd_files(trainer.num_devices) - trainer.print(f"Embedding files exist: {queries_embds_files}") - dm.set_queries_embds(queries_embds_files=queries_embds_files) - else: - trainer.print(f"in_memory = {encoder.in_memory}") - trainer.print(f"save_file = {encoder.save_file}") - trainer.predict(model=encoder, dataloaders=dm.queries_dataloader()) - # Set the query embeddings - queries_embds_files = encoder.get_embd_files() - dm.set_queries_embds( - queries_embds=encoder.embds, queries_embds_files=queries_embds_files - ) - - # Compute the corpus embeddings - trainer.print(colored("Encode corpus", "yellow")) - encoder.is_query = False - encoder.save_file = os.path.join(task_save_path, CORPUS_EMBD_FILENAME) - encoder.in_memory = len(dm.dataset.corpus) < args.embd_in_memory_threshold - if args.load_embds and encoder.embd_files_exist(trainer.num_devices): - corpus_embds_files = encoder.get_embd_files(trainer.num_devices) - trainer.print(f"Embedding files exist: {corpus_embds_files}") - dm.set_corpus_embds(corpus_embds_files=corpus_embds_files) - else: - trainer.print(f"in_memory = {encoder.in_memory}") - trainer.print(f"save_file = {encoder.save_file}") - trainer.predict(model=encoder, dataloaders=dm.corpus_dataloader()) - # Set the corpus embeddings - 
corpus_embds_files = encoder.get_embd_files() - dm.set_corpus_embds( - corpus_embds=encoder.embds, corpus_embds_files=corpus_embds_files - ) - - # Run retriever - trainer.print(colored("Retrieve", "yellow")) - retriever.corpus_embd_dataloader = dm.corpus_embd_dataloader() - retriever.in_memory = len(dm.dataset.queries) < args.embd_in_memory_threshold - retriever.save_file = os.path.join(task_save_path, RETRIEVE_PRED_FILENAME) - trainer.predict(model=retriever, dataloaders=dm.queries_embd_dataloader()) - - # Remove the embeddings - if not args.save_embds and not args.load_embds and trainer.is_global_zero: - for file in queries_embds_files + corpus_embds_files: - if os.path.exists(file): - os.remove(file) - - # Run evaluation - if trainer.is_global_zero: - scores = run_retrieve_evaluation(dm.dataset.relevance, retriever.prediction) - trainer.print("-" * 40) - trainer.print("Dataset:", colored(f"{dataset_name}", "red")) - trainer.print("Model:", colored(f"{encoder.model.model_name}", "red")) - trainer.print("Save path:", colored(task_save_path, "yellow")) - trainer.print("Retrieval evaluation:") - trainer.print(scores) - scores |= { - "model_name": encoder.model.model_name, - "embd_dim": encoder.model.embd_dim, - "embd_dtype": encoder.model.embd_dtype, - } - with open(os.path.join(task_save_path, RETRIEVE_EVAL_FILENAME), "w") as f: - json.dump(scores, f) - trainer.print(os.path.join(task_save_path, RETRIEVE_EVAL_FILENAME)) - return scores - - return diff --git a/mteb/rteb/ebr/utils/__init__.py b/mteb/rteb/ebr/utils/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mteb/rteb/ebr/utils/data.py b/mteb/rteb/ebr/utils/data.py deleted file mode 100644 index 9032dbef4d..0000000000 --- a/mteb/rteb/ebr/utils/data.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -import json - -from torch.utils.data import Dataset - - -class EmptyDataset(Dataset): - def __init__(self, data, transform=None): - self.transform = transform - 
self.data = data - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = self.data[idx] - - # Optionally apply any transformations - if self.transform: - item = self.transform(item) - - return item - - -class JSONLDataset(Dataset): - def __init__(self, file_path, transform=None): - self.file_path = file_path - self.transform = transform - self.data = [] - - # Load data from JSONL file - if isinstance(file_path, str): - with open(file_path) as f: - for line in f: - self.data.append(json.loads(line)) - elif isinstance(file_path, list): - for path in file_path: - with open(path) as f: - for line in f: - self.data.append(json.loads(line)) - else: - raise ValueError("file_path must be a string or a list of strings.") - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = self.data[idx] - - # Optionally apply any transformations - if self.transform: - item = self.transform(item) - - return item diff --git a/mteb/rteb/ebr/utils/distributed.py b/mteb/rteb/ebr/utils/distributed.py deleted file mode 100644 index 7fa5e2026f..0000000000 --- a/mteb/rteb/ebr/utils/distributed.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -import torch.distributed as dist - - -def gather_list(data: list, num_devices: int): - """Gather list data and merge them into a list.""" - if num_devices == 1: - return data - gathered = [None] * num_devices - dist.all_gather_object(gathered, data) - gathered = sum(gathered, []) - return gathered diff --git a/mteb/rteb/ebr/utils/lazy_import.py b/mteb/rteb/ebr/utils/lazy_import.py deleted file mode 100644 index 4105b81669..0000000000 --- a/mteb/rteb/ebr/utils/lazy_import.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import annotations - -import importlib -import importlib.util -from types import ModuleType -from typing import Any - - -def prompt_install(package: str, version: str | None = None) -> bool: - """Checks whether the user wants to install a module before 
proceeding.""" - raise ModuleNotFoundError( - f"{package}{'==' + version if version else ''} not found." - ) - - -class LazyImport(ModuleType): - """Lazily import a module to avoid unnecessary dependencies. If a required - dependency does not exist, it will prompt the user for it. - - Adapted from fzliu/radient/utils/lazy_loader.py. - """ - - def __init__( - self, - name: str, - attribute: str | None = None, - package_name: str | None = None, - min_version: str | None = None, - ): - super().__init__(name) - self._attribute = attribute - self._top_name = name.split(".")[0] - self._package_name = package_name if package_name else self._top_name - self._min_version = min_version - self._module = None - - def __call__(self, *args, **kwargs) -> Any: - return self._evaluate()(*args, **kwargs) - - def __getattr__(self, attribute: str) -> Any: - return getattr(self._evaluate(), attribute) - - def __dir__(self) -> list: - return dir(self._evaluate()) - - def _evaluate(self) -> ModuleType: - if not self._module: - if not importlib.util.find_spec(self._top_name): - prompt_install(self._package_name, self._min_version) - self._module = importlib.import_module(self.__name__) - if self._min_version and self._module.__version__ < self._min_version: - prompt_install(self._package_name, self._min_version) - self._module = importlib.import_module(self.__name__) - if self._attribute: - return getattr(self._module, self._attribute) - return self._module diff --git a/mteb/tasks/RTEB/RTEBAILACasedocsTask.py b/mteb/tasks/RTEB/RTEBAILACasedocsTask.py new file mode 100644 index 0000000000..1cefda9e8a --- /dev/null +++ b/mteb/tasks/RTEB/RTEBAILACasedocsTask.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBAILACasedocs(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBAILACasedocs", + "description": "RTEB evaluation for AILACasedocs dataset.", + 
"reference": "https://zenodo.org/records/4063986", + "dataset_path": "zenodo/4063986", # Using Zenodo DOI as path + "dataset_revision": "4106e6bcc72e0698d714ea8b101355e3e238431a", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # Date not specified in dataset metadata + "domains": ["Legal"], + "task_subtypes": ["Article retrieval"], + "license": "cc-by-4.0", # Standardized license format + "bibtex_citation": """@dataset{paheli_bhattacharya_2020_4063986, + author = {Paheli Bhattacharya and + Kripabandhu Ghosh and + Saptarshi Ghosh and + Arindam Pal and + Parth Mehta and + Arnab Bhattacharya and + Prasenjit Majumder}, + title = {AILA 2019 Precedent & Statute Retrieval Task}, + month = oct, + year = 2020, + publisher = {Zenodo}, + doi = {10.5281/zenodo.4063986}, + url = {https://doi.org/10.5281/zenodo.4063986} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + # Allow configuration via environment variable or default to the original path + super().__init__(rteb_dataset_name="AILACasedocs", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBAILAStatutesTask.py b/mteb/tasks/RTEB/RTEBAILAStatutesTask.py new file mode 100644 index 0000000000..1d946558b0 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBAILAStatutesTask.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBAILAStatutes(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBAILAStatutes", + "description": "RTEB evaluation for AILAStatutes dataset.", + "reference": "https://zenodo.org/records/4063986", + "dataset_path": "zenodo/4063986", # Using Zenodo DOI as path + "dataset_revision": "ebfcd844eadd3d667efa3c57fc5c8c87f5c2867e", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # Date not specified in dataset metadata + 
"domains": ["Legal"], + "task_subtypes": ["Article retrieval"], + "license": "cc-by-4.0", # Standardized license format + "bibtex_citation": """@dataset{paheli_bhattacharya_2020_4063986, + author = {Paheli Bhattacharya and + Kripabandhu Ghosh and + Saptarshi Ghosh and + Arindam Pal and + Parth Mehta and + Arnab Bhattacharya and + Prasenjit Majumder}, + title = {AILA 2019 Precedent & Statute Retrieval Task}, + month = oct, + year = 2020, + publisher = {Zenodo}, + doi = {10.5281/zenodo.4063986}, + url = {https://doi.org/10.5281/zenodo.4063986} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="AILAStatutes", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBAPPSTask.py b/mteb/tasks/RTEB/RTEBAPPSTask.py new file mode 100644 index 0000000000..aa30d7a1f4 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBAPPSTask.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBAPPS(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBAPPS", + "description": "RTEB evaluation for APPS dataset.", + "reference": "https://arxiv.org/abs/2105.09938", + "dataset_path": "CoIR-Retrieval/apps", + "dataset_revision": "f22508f96b7a36c2415181ed8bb76f76e04ae2d5", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2021-05-20", "2021-05-20"), + "task_subtypes": ["Code retrieval"], + "license": "mit", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@article{hendrycksapps2021, + title={Measuring Coding Challenge Competence With APPS}, + author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} 
+}""", + "modalities": ["text"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="APPS", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBCOVID_QATask.py b/mteb/tasks/RTEB/RTEBCOVID_QATask.py new file mode 100644 index 0000000000..e540d497c3 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBCOVID_QATask.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBCOVID_QA(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBCOVID_QA", + "description": "RTEB evaluation for COVID_QA dataset.", + "reference": "https://aclanthology.org/2020.nlpcovid19-acl.18/", + "dataset_path": "castorini/covid_qa_castorini", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2020-01-01", "2020-12-31"), + "domains": ["Medical"], + "task_subtypes": ["Question answering"], + "license": "apache-2.0", + "annotations_creators": "expert-annotated", + "text_creation": "found", + "bibtex_citation": """@inproceedings{moller-etal-2020-covid, + title = "{COVID}-QA: A Question Answering Dataset for {COVID}-19", + author = "M{\"o}ller, Erik and + Brasch, Malte and + Eger, Steffen and + {\"U}z{\"u}mc{\"u}o{\\u{g}}lu, Hakan and + Reimers, Nils and + Gurevych, Iryna", + booktitle = "Proceedings of the 1st Workshop on NLP for COVID-19 (part 2) at ACL 2020", + month = nov, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.nlpcovid19-acl.18", + doi = "10.18653/v1/2020.nlpcovid19-acl.18", + pages = "145--152", + abstract = "We present COVID-QA, a Question Answering dataset consisting of 2,019 question/answer pairs annotated by volunteer biomedical experts on scientific articles about COVID-19. 
The dataset is designed to be challenging for current QA systems, as it requires reasoning over multiple sentences and paragraphs. We provide baseline results using several state-of-the-art QA models and analyze their performance.", +}""", + "modalities": ["text"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="COVID_QA", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBChatDoctor_HealthCareMagicTask.py b/mteb/tasks/RTEB/RTEBChatDoctor_HealthCareMagicTask.py new file mode 100644 index 0000000000..b2ae5e0ca3 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBChatDoctor_HealthCareMagicTask.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBChatDoctor_HealthCareMagic(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBChatDoctor_HealthCareMagic", + "description": "RTEB evaluation for ChatDoctor_HealthCareMagic dataset.", + "reference": "https://github.com/Kent0n-Li/ChatDoctor", + "dataset_path": "lavita/ChatDoctor-HealthCareMagic-100k", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2023-06-24", "2023-06-24"), + "task_subtypes": [], + "license": "cc-by-4.0", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@article{Li2023ChatDoctor, + author = {Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You}, + title = {ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge}, + journal = {Cureus}, + year = {2023}, + volume = {15}, + number = {6}, + pages = {e40895}, + doi = {10.7759/cureus.40895} +}""", + "modalities": ["text"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + 
super().__init__( + rteb_dataset_name="ChatDoctor_HealthCareMagic", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBConvFinQATask.py b/mteb/tasks/RTEB/RTEBConvFinQATask.py new file mode 100644 index 0000000000..cfb335bb00 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBConvFinQATask.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBConvFinQA(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBConvFinQA", + "description": "RTEB evaluation for ConvFinQA dataset.", + "reference": "https://github.com/czyssrs/ConvFinQA", + "dataset_path": "FinGPT/fingpt-convfinqa", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2022-10-07", "2022-10-07"), + "task_subtypes": ["Question answering"], + "license": "mit", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@article{chen2022convfinqa, + title={ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering}, + author={Chen, Zhiyu and Chen, Wenhu and Wang, Chuhan and Zhang, Xinyi and Zhang, Yuchi and Smrz, Pavel and Yu, Xiangyu and Fung, Pascale}, + journal={arXiv preprint arXiv:2210.03849}, + year={2022} +}""", + "modalities": ["text"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="ConvFinQA", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBDS1000Task.py b/mteb/tasks/RTEB/RTEBDS1000Task.py new file mode 100644 index 0000000000..8aec3b09ad --- /dev/null +++ b/mteb/tasks/RTEB/RTEBDS1000Task.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBDS1000(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBDS1000", + "description": "RTEB 
evaluation for DS1000 dataset.", + "reference": "https://ds1000-code-gen.github.io/", + "dataset_path": "xlangai/DS-1000", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2022-11-18", "2022-11-18"), + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "cc-by-sa-4.0", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{luo2022ds, + title={DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation}, + author={Luo, Zhoujun and Wang, Chong and Wang, Shangqing and Xia, Han and Zhang, Yuyao and Yu, Shujie and Yin, Hailian and Li, Shi Han and Lai, Binyuan and Chen, Xuanlin and others}, + journal={arXiv preprint arXiv:2211.11501}, + year={2022} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn", "python-Code"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="DS1000", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBDialogsumGermanTask.py b/mteb/tasks/RTEB/RTEBDialogsumGermanTask.py new file mode 100644 index 0000000000..bd523857c6 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBDialogsumGermanTask.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBDialogsumGerman(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBDialogsumGerman", + "description": "RTEB evaluation for DialogsumGerman dataset.", + "reference": "https://aclanthology.org/2021.findings-acl.449/", + "dataset_path": "fathyshalab/Dialogsum-german", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2021-05-01", "2021-05-31"), + "domains": ["Spoken"], + "task_subtypes": ["Conversational retrieval"], + "license": "not specified", + "annotations_creators": "human-annotated", + 
"text_creation": "found", + "bibtex_citation": """@inproceedings{chen-etal-2021-dialogsum, + title = "{D}ialog{S}um: A Real-Life Scenario Dialogue Summarization Dataset", + author = "Chen, Yulong and + Liu, Chong and + Chen, Xin and + Zhao, Hao and + Liu, Tianyu and + Li, Leyang and + Rui, Ruyi and + Zhou, Dandan and + Wang, Chen and + Li, Xiang and + Sun, Zheng and + Yan, Xiaoyu and + Wang, Xixin and + Gao, Xin and + Yan, Xiang and + Huang, Xiaofei and + Yan, Huajian and + Wang, Xinsong", + booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.findings-acl.449", + doi = "10.18653/v1/2021.findings-acl.449", + pages = "5062--5074", + abstract = "Dialogue summarization is a challenging task that requires understanding the context and generating a concise summary of a conversation. Existing datasets for dialogue summarization are limited in size and diversity, which hinders the development of robust models. In this paper, we propose DialogSum, a large-scale dialogue summarization dataset consisting of 13,460 dialogues with corresponding manually labeled summaries and topics. We collect dialogues from various real-life scenarios, including customer service, online forums, and daily conversations. We also provide a detailed analysis of the dataset and baseline results using state-of-the-art models. 
Experimental results show that DialogSum is a challenging dataset and provides a valuable resource for future research on dialogue summarization.", +}""", + "modalities": ["text"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="DialogsumGerman", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBFiQAPersonalFinanceTask.py b/mteb/tasks/RTEB/RTEBFiQAPersonalFinanceTask.py new file mode 100644 index 0000000000..aec6aac24c --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFiQAPersonalFinanceTask.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFiQAPersonalFinance(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFiQAPersonalFinance", + "description": "RTEB evaluation for FiQAPersonalFinance dataset.", + "reference": "https://sites.google.com/view/fiqa/home", + "dataset_path": "bilalRahib/fiqa-personal-finance-dataset", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2018-01-01", "2018-12-31"), + "domains": ["Financial"], + "task_subtypes": ["Question answering"], + "license": "not specified", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@inproceedings{fiqa_2018, + title = {{FiQA-2018} Shared Task: Financial Opinion Mining and Question Answering}, + author = {Radu Tudor Ionescu and Saif Mohammad and Svetlana Kiritchenko and Smaranda Muresan}, + booktitle = {Proceedings of the {ACL} 2018 Workshop on Building {NLP} Solutions for Under Resourced Languages ({BNSUL})}, + month = jul, + year = {2018}, + address = {Melbourne, Australia}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/W18-3501}, + doi = {10.18653/v1/W18-3501}, + pages = {1--10} +}""", + "modalities": ["text"], + "eval_langs": 
["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="FiQAPersonalFinance", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBFinQATask.py b/mteb/tasks/RTEB/RTEBFinQATask.py new file mode 100644 index 0000000000..aecede71de --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFinQATask.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFinQA(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFinQA", + "description": "RTEB evaluation for FinQA dataset.", + "reference": "https://finqasite.github.io/", + "dataset_path": "ibm-research/finqa", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # Original dataset had date (2021-09-01) but set to None for consistency + "domains": ["Financial"], + "task_subtypes": ["Question answering"], + "license": "mit", # Standardized license format + "annotations_creators": "expert-annotated", + "text_creation": "found", + "bibtex_citation": """@article{chen2021finqa, + title={FinQA: A Dataset of Numerical Reasoning over Financial Data}, + author={Chen, Wenhu and Chen, Zhiyu and Wang, Chuhan and Zhang, Xinyi and Zhang, Yuchi and Smrz, Pavel and Yu, Xiangyu and Fung, Pascale}, + journal={arXiv preprint arXiv:2109.00122}, + year={2021} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="FinQA", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBFinanceBenchTask.py b/mteb/tasks/RTEB/RTEBFinanceBenchTask.py new file mode 100644 index 0000000000..4b924d068e --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFinanceBenchTask.py @@ -0,0 +1,42 @@ +# Concrete RTEB task definition for FinanceBench +from 
__future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFinanceBench(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFinanceBench", + "description": "RTEB evaluation for FinanceBench dataset.", + "reference": "https://github.com/patronus-ai/financebench", + "dataset_path": "PatronusAI/financebench", + "dataset_revision": "main", # Assuming main based on HF page + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2023-11-20", "2023-11-20"), # Using the date of the arXiv paper + "domains": ["Financial"], # Based on dataset type + "task_subtypes": ["Question answering"], + "license": "not specified", # TODO: Verify license + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@misc{islam2023financebench, + title={FinanceBench: A New Benchmark for Financial Question Answering}, + author={Pranab Islam and Anand Kannappan and Douwe Kiela and Rebecca Qian and Nino Scherrer and Bertie Vidgen}, + year={2023}, + eprint={2311.11944}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + # Allow configuration via environment variable or default to the original path + super().__init__(rteb_dataset_name="FinanceBench", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBFrenchBoolQTask.py b/mteb/tasks/RTEB/RTEBFrenchBoolQTask.py new file mode 100644 index 0000000000..6ce465842b --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFrenchBoolQTask.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFrenchBoolQ(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFrenchBoolQ", + "description": "RTEB evaluation for 
FrenchBoolQ dataset.", + "reference": "https://github.com/google-research-datasets/boolean-questions", + "dataset_path": "manu/french_boolq", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2019-01-01", "2019-12-31"), + "domains": ["Spoken"], + "task_subtypes": ["Question answering"], + "license": "not specified", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{clark2019boolq, + title={BoolQ: Exploring the surprising difficulty of natural Yes/No questions}, + author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1905.10441}, + year={2019} +}""", + "modalities": ["text"], + "eval_langs": ["fra-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="FrenchBoolQ", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBFrenchOpenFiscalTextsTask.py b/mteb/tasks/RTEB/RTEBFrenchOpenFiscalTextsTask.py new file mode 100644 index 0000000000..ef36d10c5d --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFrenchOpenFiscalTextsTask.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFrenchOpenFiscalTexts(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFrenchOpenFiscalTexts", + "description": "RTEB evaluation for FrenchOpenFiscalTexts dataset.", + "reference": "https://echanges.dila.gouv.fr/OPENDATA/JADE/", # OPENDATA/JADE source + "dataset_path": "StanBienaives/french-open-fiscal-texts", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ( + "2000-01-01", + "2023-12-31", + ), # Assuming a broad date range for case law data + "domains": ["Legal", "Financial"], + "task_subtypes": ["Article retrieval"], 
+ "license": "not specified", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """unknown""", + "modalities": ["text"], + "eval_langs": ["fra-Latn"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="FrenchOpenFiscalTexts", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBFrenchTriviaQAWikicontextTask.py b/mteb/tasks/RTEB/RTEBFrenchTriviaQAWikicontextTask.py new file mode 100644 index 0000000000..62dcec4119 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBFrenchTriviaQAWikicontextTask.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBFrenchTriviaQAWikicontext(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBFrenchTriviaQAWikicontext", + "description": "RTEB evaluation for FrenchTriviaQAWikicontext dataset.", + "reference": "https://www.cs.utexas.edu/~eunsol/files/papers/acl17jcwz.pdf", + "dataset_path": "manu/french-trivia", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2017-01-01", "2017-12-31"), + "domains": ["Spoken"], + "task_subtypes": ["Question answering"], + "license": "not specified", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{joshi2017triviaqa, + title={TriviaQA: A large scale distantly supervised challenge dataset for reading comprehension}, + author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:1705.03565}, + year={2017} +}""", + "modalities": ["text"], + "eval_langs": ["fra-Latn"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="FrenchTriviaQAWikicontext", + 
**kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBGermanLegalSentencesTask.py b/mteb/tasks/RTEB/RTEBGermanLegalSentencesTask.py new file mode 100644 index 0000000000..97c3e6465a --- /dev/null +++ b/mteb/tasks/RTEB/RTEBGermanLegalSentencesTask.py @@ -0,0 +1,34 @@ +# Concrete RTEB task definition for GermanLegalSentences +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBGermanLegalSentences(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBGermanLegalSentences", + "description": "RTEB evaluation for GermanLegalSentences dataset.", + "reference": "http://openlegaldata.io/", # Open Legal Data source + "dataset_path": "lavis-nlp/german_legal_sentences", + "dataset_revision": "main", + "eval_langs": ["deu-Latn"], + "main_score": "ndcg_at_10", + "domains": ["Legal"], + "task_subtypes": ["Article retrieval"], + "license": "not specified", # TODO: Verify license + "annotations_creators": "LM-generated", + "text_creation": "found", + "bibtex_citation": """unknown""", # TODO: Add bibtex citation + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="GermanLegalSentences", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBGithubTask.py b/mteb/tasks/RTEB/RTEBGithubTask.py new file mode 100644 index 0000000000..a1b6d75873 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBGithubTask.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBGithub(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBGithub", + "description": "RTEB evaluation for Github dataset.", + "reference": "https://github.com/CoIR-team/coir", + "dataset_path": "CoIR-team/Github", # Updated from TODO placeholder + "dataset_revision": "main", + "main_score": 
"ndcg_at_10", + "revision": "1.0.1", + "date": ("2024-07-03", "2024-07-03"), + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "apache-2.0", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@misc{li2024coircomprehensivebenchmarkcode, + title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Hao Zhang and Xinyi Dai and Yasheng Wang and Ruiming Tang}, + year={2024}, + eprint={2407.02883}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2407.02883} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn", "python-Code"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="Github", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBHC3FinanceTask.py b/mteb/tasks/RTEB/RTEBHC3FinanceTask.py new file mode 100644 index 0000000000..81e0ede8b5 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBHC3FinanceTask.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBHC3Finance(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBHC3Finance", + "description": "RTEB evaluation for HC3Finance dataset.", + "reference": "https://huggingface.co/datasets/Hello-SimpleAI/HC3", + "dataset_path": "Atharva07/hc3_finance", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # Original dataset had date range (2023-01-01 to 2023-12-31) but set to None for consistency + "domains": ["Financial"], + "task_subtypes": ["Question answering"], + "license": "not specified", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{guo2023towards, + title={Towards a 
Human-ChatGPT Comparative Corpus on Question Answering}, + author={Guo, Jiaxin and Fan, Kai and Su, Xin and Gao, Jundong and Ji, Shuo and Zhou, Yuquan and Wu, Xuejie and Wang, Cong}, + journal={arXiv preprint arXiv:2301.13867}, + year={2023} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="HC3Finance", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBHealthCareGermanTask.py b/mteb/tasks/RTEB/RTEBHealthCareGermanTask.py new file mode 100644 index 0000000000..6bf709500f --- /dev/null +++ b/mteb/tasks/RTEB/RTEBHealthCareGermanTask.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBHealthCareGerman(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBHealthCareGerman", + "description": "RTEB evaluation for HealthCareGerman dataset.", + "reference": "https://huggingface.co/datasets/thisserand/health_care_german", + "dataset_path": "thisserand/health_care_german", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, + "domains": ["Medical"], + "task_subtypes": ["Question answering"], + "license": "not specified", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """unknown""", + "modalities": ["text"], + "eval_langs": ["deu-Latn"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="HealthCareGerman", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBHumanEvalTask.py b/mteb/tasks/RTEB/RTEBHumanEvalTask.py new file mode 100644 index 0000000000..d80bd57514 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBHumanEvalTask.py @@ -0,0 +1,39 @@ +from __future__ import annotations + 
+import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBHumanEval(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBHumanEval", + "description": "RTEB evaluation for HumanEval dataset.", + "reference": "https://github.com/openai/human-eval", + "dataset_path": "openai/openai_humaneval", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": ("2021-01-01", "2021-12-31"), + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "mit", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{chen2021evaluating, + title={Evaluating large language models trained on code}, + author={Chen, Mark and Tworek, Jerry and Jun, Heewoo and Schoelkopf, Qinyuan and Le, Shi Yusong and Stevens, Foster and Ray, Aditya and Puri, Vijay and Agarwal, Rishabh and Fernandez, Lazar and others}, + journal={arXiv preprint arXiv:2107.03374}, + year={2021} +}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn", "python-Code"], + "dialect": [], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="HumanEval", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBJapanLawTask.py b/mteb/tasks/RTEB/RTEBJapanLawTask.py new file mode 100644 index 0000000000..9529689d28 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBJapanLawTask.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBJapanLaw(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBJapanLaw", + "description": "RTEB evaluation for JapanLaw dataset.", + "reference": "https://huggingface.co/datasets/y2lan/japan-law", + "dataset_path": "TODO/JapanLaw", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": 
"1.0.1", + "date": None, + "domains": ["Legal"], + "task_subtypes": ["Article retrieval"], + "license": "mit", + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """unknown""", + "modalities": ["text"], + "eval_langs": ["jpn-Jpan"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="JapanLaw", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBJapaneseCoNaLaTask.py b/mteb/tasks/RTEB/RTEBJapaneseCoNaLaTask.py new file mode 100644 index 0000000000..6389ee3d93 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBJapaneseCoNaLaTask.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBJapaneseCoNaLa(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBJapaneseCoNaLa", + "description": "RTEB evaluation for JapaneseCoNaLa dataset.", + "reference": "https://huggingface.co/datasets/haih2/japanese-conala", + "dataset_path": "haih2/japanese-conala", + "dataset_revision": "main", # Assuming main based on HF page + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "not specified", + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """unknown""", + "modalities": ["text"], + "eval_langs": [ + "jpn-Jpan", + "python-Code", + ], # Including python-Code as it's a code generation dataset + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="JapaneseCoNaLa", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBLegalQuADTask.py b/mteb/tasks/RTEB/RTEBLegalQuADTask.py new file mode 100644 index 0000000000..e66f246d8a --- /dev/null +++ b/mteb/tasks/RTEB/RTEBLegalQuADTask.py @@ -0,0 +1,39 @@ +from 
__future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBLegalQuAD(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBLegalQuAD", + "description": "RTEB evaluation for LegalQuAD dataset.", + "reference": "https://github.com/elenanereiss/LegalQuAD", + "dataset_path": "elenanereiss/LegalQuAD", # Updated from local path to HF path + "dataset_revision": "dd73c838031a4914a7a1a16d785b8cec617aaaa4", + "main_score": "ndcg_at_10", + "revision": "1.0.0", + "date": None, # LegalQuAD doesn't have a specific date range + "domains": ["Legal"], + "task_subtypes": ["Question answering"], + "license": "cc-by-nc-sa-4.0", # Standardized license format + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@inproceedings{reiss-etal-2021-legalquad, + title={LegalQuAD: A Dataset for Legal Question Answering over Documents}, + author={Reiss, Elena and Wohlfarth, Maximilian and Wirth, Christian and Biemann, Chris}, + booktitle={Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics}, + year={2021}, + organization={ACL} +}""", + "modalities": ["text"], + "eval_langs": ["deu-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="LegalQuAD", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBLegalSummarizationTask.py b/mteb/tasks/RTEB/RTEBLegalSummarizationTask.py new file mode 100644 index 0000000000..eff89fec56 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBLegalSummarizationTask.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBLegalSummarization(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBLegalSummarization", + "description": "RTEB evaluation for 
LegalSummarization dataset.", + "reference": "https://huggingface.co/datasets/mteb/legal_summarization", + "dataset_path": "mteb/legal_summarization", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # No specific date range available + "domains": ["Legal"], + "task_subtypes": ["Article retrieval"], + "license": "cc-by-sa-4.0", # Standardized license format + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """unknown""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__( + rteb_dataset_name="LegalSummarization", + **kwargs, + ) diff --git a/mteb/tasks/RTEB/RTEBMBPPTask.py b/mteb/tasks/RTEB/RTEBMBPPTask.py new file mode 100644 index 0000000000..c371570d6e --- /dev/null +++ b/mteb/tasks/RTEB/RTEBMBPPTask.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBMBPP(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBMBPP", + "description": "RTEB evaluation for MBPP dataset.", + "reference": "https://huggingface.co/datasets/Muennighoff/mbpp", + "dataset_path": "Muennighoff/mbpp", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # MBPP doesn't have a specific date range + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "cc-by-sa-4.0", # Standardized license format + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@article{appel2022mbpp, + title={MBPP: A Code Generation Benchmark for the Classroom}, + author={Appel, Alexander and Yang, Ke and Yin, Pengcheng and others}, + journal={arXiv preprint arXiv:2208.05317}, + year={2022} +}""", + "modalities": ["text"], + "eval_langs": 
["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="MBPP", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBTAT_QATask.py b/mteb/tasks/RTEB/RTEBTAT_QATask.py new file mode 100644 index 0000000000..031bc59522 --- /dev/null +++ b/mteb/tasks/RTEB/RTEBTAT_QATask.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBTAT_QA(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBTAT_QA", + "description": "RTEB evaluation for TAT_QA dataset.", + "reference": "https://huggingface.co/datasets/next-tat/TAT-QA", + "dataset_path": "next-tat/TAT-QA", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # TAT-QA doesn't specify a date range + "domains": ["Financial"], + "task_subtypes": ["Question answering"], + "license": "cc-by-sa-4.0", # Standardized license format + "annotations_creators": "human-annotated", + "text_creation": "found", + "bibtex_citation": """@inproceedings{zhu2021tat, title={TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance}, author={Zhu, Fengbin and Lei, Wenqiang and Huang, Youcheng and Wang, Chao and Zhang, Shuo and Lv, Jiancheng and Feng, Fuli and Chua, Tat-Seng}, booktitle={Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics}, year={2021}}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="TAT_QA", **kwargs) diff --git a/mteb/tasks/RTEB/RTEBWikiSQLTask.py b/mteb/tasks/RTEB/RTEBWikiSQLTask.py new file mode 100644 index 0000000000..f2eb77776e --- /dev/null +++ b/mteb/tasks/RTEB/RTEBWikiSQLTask.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB + +logger = logging.getLogger(__name__) + + +class RTEBWikiSQL(AbsTaskRTEB): + _TASK_SPECIFIC_METADATA = { + "task_name": "RTEBWikiSQL", + "description": "RTEB evaluation for WikiSQL dataset.", + "reference": "https://huggingface.co/datasets/Salesforce/wikisql", + "dataset_path": 
"Salesforce/wikisql", + "dataset_revision": "main", + "main_score": "ndcg_at_10", + "revision": "1.0.1", + "date": None, # WikiSQL doesn't specify a date range + "domains": ["Programming"], + "task_subtypes": ["Question answering"], + "license": "bsd-3-clause", # WikiSQL is released under BSD-3-Clause (salesforce/WikiSQL) + "annotations_creators": "derived", + "text_creation": "found", + "bibtex_citation": """@article{zhong2017seq2sql, title={Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning}, author={Zhong, Victor and Xiong, Caiming and Socher, Richard}, journal={arXiv preprint arXiv:1709.00103}, year={2017}}""", + "modalities": ["text"], + "eval_langs": ["eng-Latn"], + } + + metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA) + + def __init__(self, **kwargs): + super().__init__(rteb_dataset_name="WikiSQL", **kwargs) diff --git a/mteb/tasks/RTEB/__init__.py b/mteb/tasks/RTEB/__init__.py new file mode 100644 index 0000000000..fe126f4bd5 --- /dev/null +++ b/mteb/tasks/RTEB/__init__.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from .RTEBAILACasedocsTask import RTEBAILACasedocs as RTEBAILACasedocs +from .RTEBAILAStatutesTask import RTEBAILAStatutes as RTEBAILAStatutes +from .RTEBAPPSTask import RTEBAPPS as RTEBAPPS +from .RTEBChatDoctor_HealthCareMagicTask import ( + RTEBChatDoctor_HealthCareMagic as RTEBChatDoctor_HealthCareMagic, +) +from .RTEBConvFinQATask import RTEBConvFinQA as RTEBConvFinQA +from .RTEBCOVID_QATask import RTEBCOVID_QA as RTEBCOVID_QA +from .RTEBDialogsumGermanTask import RTEBDialogsumGerman as RTEBDialogsumGerman +from .RTEBDS1000Task import RTEBDS1000 as RTEBDS1000 +from .RTEBFinanceBenchTask import RTEBFinanceBench as RTEBFinanceBench +from .RTEBFinQATask import RTEBFinQA as RTEBFinQA +from .RTEBFiQAPersonalFinanceTask import ( + RTEBFiQAPersonalFinance as RTEBFiQAPersonalFinance, +) +from .RTEBFrenchBoolQTask import RTEBFrenchBoolQ as RTEBFrenchBoolQ +from .RTEBFrenchOpenFiscalTextsTask import ( + RTEBFrenchOpenFiscalTexts as RTEBFrenchOpenFiscalTexts, +) +from .RTEBFrenchTriviaQAWikicontextTask import ( + RTEBFrenchTriviaQAWikicontext as RTEBFrenchTriviaQAWikicontext, +) +from .RTEBGermanLegalSentencesTask import ( + RTEBGermanLegalSentences as 
RTEBGermanLegalSentences, +) +from .RTEBGithubTask import RTEBGithub as RTEBGithub +from .RTEBHC3FinanceTask import RTEBHC3Finance as RTEBHC3Finance +from .RTEBHealthCareGermanTask import RTEBHealthCareGerman as RTEBHealthCareGerman +from .RTEBHumanEvalTask import RTEBHumanEval as RTEBHumanEval +from .RTEBJapaneseCoNaLaTask import RTEBJapaneseCoNaLa as RTEBJapaneseCoNaLa +from .RTEBJapanLawTask import RTEBJapanLaw as RTEBJapanLaw +from .RTEBLegalQuADTask import RTEBLegalQuAD as RTEBLegalQuAD +from .RTEBLegalSummarizationTask import RTEBLegalSummarization as RTEBLegalSummarization +from .RTEBMBPPTask import RTEBMBPP as RTEBMBPP +from .RTEBTAT_QATask import RTEBTAT_QA as RTEBTAT_QA +from .RTEBWikiSQLTask import RTEBWikiSQL as RTEBWikiSQL diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 8abdf1f811..e1add7655f 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -17,6 +17,7 @@ from .PairClassification import * from .Reranking import * from .Retrieval import * +from .RTEB import * from .SpeedTask import * from .STS import * from .Summarization import * diff --git a/mteb/tasks/aggregated_tasks/RTEBAggregatedTask.py b/mteb/tasks/aggregated_tasks/RTEBAggregatedTask.py new file mode 100644 index 0000000000..63d2ec3742 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/RTEBAggregatedTask.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.RTEB.RTEBAILACasedocsTask import RTEBAILACasedocs +from mteb.tasks.RTEB.RTEBAILAStatutesTask import RTEBAILAStatutes +from mteb.tasks.RTEB.RTEBAPPSTask import RTEBAPPS +from mteb.tasks.RTEB.RTEBChatDoctor_HealthCareMagicTask import ( + RTEBChatDoctor_HealthCareMagic, +) +from mteb.tasks.RTEB.RTEBConvFinQATask import RTEBConvFinQA +from mteb.tasks.RTEB.RTEBCOVID_QATask import RTEBCOVID_QA +from mteb.tasks.RTEB.RTEBDialogsumGermanTask import RTEBDialogsumGerman +from 
mteb.tasks.RTEB.RTEBDS1000Task import RTEBDS1000 +from mteb.tasks.RTEB.RTEBFinanceBenchTask import RTEBFinanceBench +from mteb.tasks.RTEB.RTEBFinQATask import RTEBFinQA +from mteb.tasks.RTEB.RTEBFiQAPersonalFinanceTask import RTEBFiQAPersonalFinance +from mteb.tasks.RTEB.RTEBFrenchBoolQTask import RTEBFrenchBoolQ +from mteb.tasks.RTEB.RTEBFrenchOpenFiscalTextsTask import RTEBFrenchOpenFiscalTexts +from mteb.tasks.RTEB.RTEBFrenchTriviaQAWikicontextTask import ( + RTEBFrenchTriviaQAWikicontext, +) +from mteb.tasks.RTEB.RTEBGermanLegalSentencesTask import RTEBGermanLegalSentences +from mteb.tasks.RTEB.RTEBGithubTask import RTEBGithub +from mteb.tasks.RTEB.RTEBHC3FinanceTask import RTEBHC3Finance +from mteb.tasks.RTEB.RTEBHealthCareGermanTask import RTEBHealthCareGerman +from mteb.tasks.RTEB.RTEBHumanEvalTask import RTEBHumanEval +from mteb.tasks.RTEB.RTEBJapaneseCoNaLaTask import RTEBJapaneseCoNaLa +from mteb.tasks.RTEB.RTEBJapanLawTask import RTEBJapanLaw +from mteb.tasks.RTEB.RTEBLegalQuADTask import RTEBLegalQuAD +from mteb.tasks.RTEB.RTEBLegalSummarizationTask import RTEBLegalSummarization +from mteb.tasks.RTEB.RTEBMBPPTask import RTEBMBPP +from mteb.tasks.RTEB.RTEBTAT_QATask import RTEBTAT_QA +from mteb.tasks.RTEB.RTEBWikiSQLTask import RTEBWikiSQL + +task_list_rteb: list[AbsTask] = [ # all RTEB subtasks, instantiated eagerly at import time; kept in alphabetical order to match the import list above + RTEBAILACasedocs(), + RTEBAILAStatutes(), + RTEBAPPS(), + RTEBChatDoctor_HealthCareMagic(), + RTEBConvFinQA(), + RTEBCOVID_QA(), + RTEBDialogsumGerman(), + RTEBDS1000(), + RTEBFinanceBench(), + RTEBFinQA(), + RTEBFiQAPersonalFinance(), + RTEBFrenchBoolQ(), + RTEBFrenchOpenFiscalTexts(), + RTEBFrenchTriviaQAWikicontext(), + RTEBGermanLegalSentences(), + RTEBGithub(), + RTEBHC3Finance(), + RTEBHealthCareGerman(), + RTEBHumanEval(), + RTEBJapaneseCoNaLa(), + RTEBJapanLaw(), + RTEBLegalQuAD(), + RTEBLegalSummarization(), + RTEBMBPP(), + RTEBTAT_QA(), + RTEBWikiSQL(), +] + + +class RTEBAggregatedTask(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="RTEBAggregatedTask", + description="Aggregated task for all RTEB tasks", + reference=None, + tasks=task_list_rteb, + main_score="average_score", # presumably the mean of subtask main scores -- TODO confirm against AbsTaskAggregate + type="RTEB", + eval_splits=["test"], + bibtex_citation=None, + ) diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py index d6ef84d795..60db4fed81 100644 --- a/mteb/tasks/aggregated_tasks/__init__.py +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -3,6 +3,7 @@ from .CQADupStackNLRetrieval import CQADupstackNLRetrieval from .CQADupStackRetrieval import CQADupstackRetrieval from .CQADupStackRetrievalFa import CQADupstackRetrievalFa +from .RTEBAggregatedTask import RTEBAggregatedTask from .STS17MultilingualVisualSTS import ( STS17MultilingualVisualSTSEng, STS17MultilingualVisualSTSMultilingual, @@ -22,4 +23,5 @@ "STS17MultilingualVisualSTSMultilingual", "STSBenchmarkMultilingualVisualSTSEng", "STSBenchmarkMultilingualVisualSTSMultilingual", + "RTEBAggregatedTask", ]