Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"ColBERT",
]
DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"]
EMBEDDING_DTYPES = Literal["float32", "int8", "binary"]


def sentence_transformers_loader(
Expand All @@ -58,6 +59,34 @@ def get_loader_name(
return loader.__name__


def model_id(
model_name: str,
embd_dtype: str
| None, # Keep None here as input can still be None before default assignment
embd_dim: int | None,
) -> str:
# Handle potential None values passed to the function, even if the class attribute has a default
if model_name is None:
# Or handle appropriately, maybe raise error if name is critical for ID
model_name_part = "unknown_model"
else:
model_name_part = model_name.replace("/", "__")

dtype_str = embd_dtype if embd_dtype else "unknown_dtype"
dim_str = f"{embd_dim}d" if embd_dim else "unknown_dim"

# Check if default was used implicitly for dtype
if embd_dtype is None:
# If the class attribute defaults to 'float32', reflect that possibility if None is passed
# However, the class instance will have 'float32' if not specified.
# Let's assume the function should reflect the actual value passed or derived.
# If the intention is to always use the default if None is passed, adjust logic here.
# For now, stick to representing the input or lack thereof.
pass # dtype_str is already "unknown_dtype"

return f"{model_name_part}_{dtype_str}_{dim_str}"


class ModelMeta(BaseModel):
"""The model metadata object.

Expand All @@ -70,6 +99,7 @@ class ModelMeta(BaseModel):
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
models).
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
embd_dtype: The data type of the embeddings produced by the model (e.g., "float32", "int8", "binary"). Defaults to "float32".
revision: The revision number of the model. If None, it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_weights is True.
Expand Down Expand Up @@ -116,6 +146,10 @@ class ModelMeta(BaseModel):
superseded_by: str | None = None
is_cross_encoder: bool | None = None
modalities: list[MODALITIES] = ["text"]
# Attribute merged from rteb/ebr/core/meta.py
embd_dtype: EMBEDDING_DTYPES = (
"float32" # Defaulting to float32 as requested, type hint updated
)

def to_dict(self):
dict_repr = self.model_dump()
Expand Down Expand Up @@ -263,6 +297,15 @@ def calculate_memory_usage_mb(self) -> int | None:
model_memory_mb = model_memory_bytes / MB
return round(model_memory_mb)

@property
def _id(self) -> str:
    """Unique identifier for this model, built from name, embedding dtype and dimension.

    Raises:
        ValueError: If the model has no name, since the name is required
            to build the identifier.
    """
    name = self.name
    if name is None:
        raise ValueError("Model name is required to generate an ID.")
    # embd_dtype defaults to "float32" on the class if not explicitly set.
    return model_id(name, self.embd_dtype, self.embed_dim)


def collect_similar_tasks(dataset: str, visited: set[str]) -> set[str]:
"""Recursively collect all similar tasks for a given dataset."""
Expand Down
133 changes: 83 additions & 50 deletions mteb/rteb/ebr/__main__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from __future__ import annotations

import argparse
from collections import defaultdict
import json
import logging
import os
from collections import defaultdict
from pathlib import Path

import pytorch_lightning as pl
from pytorch_lightning.strategies.ddp import DDPStrategy

from ebr.retrieve import run_retrieve_task
from ebr.datasets import DatasetMeta, DATASET_REGISTRY
from ebr.models import ModelMeta, MODEL_REGISTRY
from ebr.core import Encoder, Retriever

from ebr.datasets import DATASET_REGISTRY, DatasetMeta
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these paths will probably need to be updated to conform to the mteb standard.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, definitely — this is why I asked what the plan is. I guessed there will be a step to make the rteb code work ;-)

from ebr.models import MODEL_REGISTRY, ModelMeta
from ebr.retrieve import run_retrieve_task
from pytorch_lightning.strategies.ddp import DDPStrategy

logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
Expand All @@ -23,53 +23,88 @@ def get_args() -> argparse.Namespace:

# Evaluation
parser.add_argument(
"--gpus", type=int, default=0, help="Number of gpus used for encoding.")
"--gpus", type=int, default=0, help="Number of gpus used for encoding."
)
parser.add_argument(
"--cpus", type=int, default=1, help="Number of cpus used for computation (this is only for models that are not using gpus).")
"--cpus",
type=int,
default=1,
help="Number of cpus used for computation (this is only for models that are not using gpus).",
)
parser.add_argument("--bf16", action="store_true", help="`Use bf16 precision.")
parser.add_argument(
"--bf16", action="store_true", help="`Use bf16 precision.")
"--batch_size", type=int, default=16, help="Batch size for encoding."
)
parser.add_argument(
"--batch_size", type=int, default=16, help="Batch size for encoding.")
"--embd_batch_size",
type=int,
default=1024,
help="Batch size for computing similarity of embeddings.",
)
parser.add_argument(
"--embd_batch_size", type=int, default=1024, help="Batch size for computing similarity of embeddings.")
parser.add_argument(
"--embd_in_memory_threshold", type=int, default=200000,
help="Embeddings will be stored in memory if the amount is below this threshold.")
"--embd_in_memory_threshold",
type=int,
default=200000,
help="Embeddings will be stored in memory if the amount is below this threshold.",
)

# Model
#parser.add_argument(
# parser.add_argument(
# "--model_name", type=str, default=None, help="Model name or path.")
#parser.add_argument(
# parser.add_argument(
# "--embd_dtype", type=str, default="float", help="Embedding type. Options: float32, int8, binary.")
#parser.add_argument(
# parser.add_argument(
# "--embd_dim", type=int, default=None, help="Embedding dimension.")
#parser.add_argument(
# parser.add_argument(
# "--max_length", type=int, default=None, help="Maximum length of model input.")

# Data
parser.add_argument(
"--data_path", type=str, default="data/", help="Path of the dataset, must be specified for custom tasks.")
"--data_path",
type=str,
default="data/",
help="Path of the dataset, must be specified for custom tasks.",
)
parser.add_argument(
"--task_name", type=str, default=None, help="Name of the task. Can be multiple tasks splitted by `,`.")
"--task_name",
type=str,
default=None,
help="Name of the task. Can be multiple tasks splitted by `,`.",
)
parser.add_argument(
"--data_type", default="eval", choices=["eval", "train", "chunk", "merge"], help="Dataset type.")
"--data_type",
default="eval",
choices=["eval", "train", "chunk", "merge"],
help="Dataset type.",
)
parser.add_argument(
"--num_workers", type=int, default=4, help="Number of workers for dataloader.")

"--num_workers", type=int, default=4, help="Number of workers for dataloader."
)

# Output
parser.add_argument(
"--save_path", type=str, default="output/", help="Path to save the output.")
"--save_path", type=str, default="output/", help="Path to save the output."
)
parser.add_argument(
"--save_embds", action="store_true", help="Whether to save the embeddings.")
"--save_embds", action="store_true", help="Whether to save the embeddings."
)
parser.add_argument(
"--load_embds", action="store_true", help="Whether to load the computed embeddings.")
"--load_embds",
action="store_true",
help="Whether to load the computed embeddings.",
)
parser.add_argument(
"--save_prediction", action="store_true", help="Whether to save the predictions.")
"--save_prediction",
action="store_true",
help="Whether to save the predictions.",
)
parser.add_argument(
"--topk", type=int, default=100, help="Number of top documents per query.")
"--topk", type=int, default=100, help="Number of top documents per query."
)
parser.add_argument(
"--overwrite", action="store_true", help="Whether to overwrite the results.")

"--overwrite", action="store_true", help="Whether to overwrite the results."
)

args = parser.parse_args()
return args

Expand All @@ -82,6 +117,7 @@ def _dump_model_meta(
with open(Path(results_dir) / "models.json", "w") as f:
f.write(json.dumps(models, indent=4))


def _dump_dataset_info(
results_dir: str = "results",
dataset_registry: dict[str, DatasetMeta] = DATASET_REGISTRY,
Expand All @@ -94,38 +130,36 @@ def _dump_dataset_info(

groups = []
for (leaderboard, group_name), datasets in group_data.items():
groups.append({"name": group_name, "datasets": datasets, "leaderboard": leaderboard})
groups.append(
{"name": group_name, "datasets": datasets, "leaderboard": leaderboard}
)
with open(Path(results_dir) / "datasets.json", "w") as f:
f.write(json.dumps(groups, indent=4))


def _compile_results(
results_dir: str = "results",
output_dir: str = "output"
):
def _compile_results(results_dir: str = "results", output_dir: str = "output"):
results = []
for dataset_output_dir in Path(output_dir).iterdir():

dataset_results = []
for one_result in dataset_output_dir.iterdir():

eval_file = one_result / "retrieve_eval.json"
if eval_file.exists():
with open(eval_file) as f:
dataset_results.append(json.load(f))

results.append({
**DATASET_REGISTRY[dataset_output_dir.name].model_dump(),
"results": dataset_results,
"is_closed": DATASET_REGISTRY[dataset_output_dir.name].tier != 3
})
results.append(
{
**DATASET_REGISTRY[dataset_output_dir.name].model_dump(),
"results": dataset_results,
"is_closed": DATASET_REGISTRY[dataset_output_dir.name].tier != 3,
}
)

with open(Path(results_dir) / "results.json", "w") as f:
f.write(json.dumps(results, indent=4))


def main(args: argparse.Namespace):

_dump_model_meta()
_dump_dataset_info()

Expand All @@ -148,27 +182,25 @@ def main(args: argparse.Namespace):

# Evaluate each model on the specified datasets
for model_meta in MODEL_REGISTRY.values():

encoder = Encoder(
model_meta.load_model(),
save_embds=args.save_embds,
load_embds=args.load_embds
load_embds=args.load_embds,
)
retriever = Retriever(
topk=args.topk,
similarity=model_meta.similarity,
save_prediction=args.save_prediction
save_prediction=args.save_prediction,
)

eval_results = {}
for dataset_meta in DATASET_REGISTRY.values():

#if trainer.is_global_zero:
# if trainer.is_global_zero:
# trainer.print(f"Evaluating {model_meta.model_name} on {dataset_meta.dataset_name}")

result = run_retrieve_task(dataset_meta, trainer, encoder, retriever, args)
eval_results[dataset_meta.dataset_name] = result

metric = "ndcg_at_10"

# Print the results
Expand All @@ -182,6 +214,7 @@ def main(args: argparse.Namespace):

_compile_results()


if __name__ == "__main__":
args = get_args()
main(args)
3 changes: 1 addition & 2 deletions mteb/rteb/ebr/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from ebr.core.encoder import Encoder
from ebr.core.retriever import Retriever
from __future__ import annotations
3 changes: 1 addition & 2 deletions mteb/rteb/ebr/core/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from ebr.core.base.dataset import RetrievalDataset
from ebr.core.base.model import EmbeddingModel, APIEmbeddingModel
from __future__ import annotations
18 changes: 6 additions & 12 deletions mteb/rteb/ebr/core/base/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,15 @@
from abc import ABC
from functools import cache
from pathlib import Path
from typing import Any, TYPE_CHECKING
from typing import TYPE_CHECKING

from torch.utils.data import Dataset

if TYPE_CHECKING:
from ebr.core.meta import DatasetMeta


def add_instruct(
dataset: Dataset,
instruct: str,
input_type: str
):

def add_instruct(dataset: Dataset, instruct: str, input_type: str):
for item in dataset.data:
if instruct:
item["text"] = instruct + item["text"]
Expand All @@ -26,7 +21,6 @@ def add_instruct(


class RetrievalDataset(ABC):

LEADERBOARD: str = None

def __init__(
Expand All @@ -35,16 +29,16 @@ def __init__(
dataset_meta: DatasetMeta,
query_instruct: str | None = None,
corpus_instruct: str | None = None,
**kwargs
**kwargs,
):
assert type(self).LEADERBOARD, f"leaderboard must be defined"
assert type(self).LEADERBOARD, "leaderboard must be defined"
super().__init__()
self._dataset_meta = dataset_meta
self._query_instruct = query_instruct
self._corpus_instruct = corpus_instruct
self._task_path = (Path(data_path) / dataset_meta.dataset_name).resolve()

#def __getattr__(self, name: str) -> Any:
# def __getattr__(self, name: str) -> Any:
# try:
# return super().__getattr__(name)
# except AttributeError:
Expand Down Expand Up @@ -79,4 +73,4 @@ def relevance(self) -> dict:
def prepare_data(self):
_ = self.corpus
_ = self.queries
_ = self.relevance
_ = self.relevance
Loading