diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py
index 112c5270a4..e3060dfbbb 100644
--- a/mteb/load_results/benchmark_results.py
+++ b/mteb/load_results/benchmark_results.py
@@ -48,11 +48,11 @@ def filter_tasks(
            continue
        if languages is not None:
            task_languages = task_result.languages
-            if not any([lang in task_languages for lang in languages]):
+            if not any(lang in task_languages for lang in languages):
                continue
        if domains is not None:
            task_domains = task_result.domains
-            if not any([domain in task_domains for domain in domains]):
+            if not any(domain in task_domains for domain in domains):
                continue
        if (task_types is not None) and (task_result.task_type not in task_types):
            continue
@@ -100,7 +100,7 @@ def get_scores(
        if format == "long":
            entries = []
            for task_res in self.task_results:
-                entry = dict(
+                entry = dict(  # noqa
                    model_name=self.model_name,
                    model_revision=self.model_revision,
                    task_name=task_res.task_name,
@@ -140,7 +140,7 @@ def domains(self) -> list[str]:

    @property
    def task_types(self) -> list[str]:
-        return list(set([task_res.task_type for task_res in self.task_results]))
+        return list({task_res.task_type for task_res in self.task_results})

    @property
    def task_names(self) -> list[str]:
diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py
index 8ee1f3a2cd..8601420427 100644
--- a/mteb/load_results/load_results.py
+++ b/mteb/load_results/load_results.py
@@ -103,9 +103,6 @@ def load_results(
            extract the model name and revision from the path.
        validate_and_filter: If True it will validate that the results object for the task contains the correct splits and filter out
            splits from the results object that are not default in the task metadata. Defaults to True.
-
-    Returns:
-
    """
    repo_directory = download_of_results(results_repo, download_latest=download_latest)
    model_paths = [p for p in (repo_directory / "results").glob("*") if p.is_dir()]
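Aside (not part of the patch): the two `any(...)` changes above swap list comprehensions for generator expressions. A minimal standalone sketch of the difference, with illustrative names:

# any() over a generator expression short-circuits on the first hit,
# while the list-comprehension form materialises every membership test first.
task_languages = ["eng-Latn", "fra-Latn"]
languages = ["eng-Latn", "deu-Latn"]

assert any(lang in task_languages for lang in languages)  # stops at "eng-Latn"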
diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 01706d560a..7acb806b81 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -1,11 +1,12 @@
 from __future__ import annotations

-from datetime import date
+import logging
 from functools import partial
-from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal
+from typing import TYPE_CHECKING, Any, Callable, Literal

-from pydantic import BaseModel, BeforeValidator, TypeAdapter
+from pydantic import BaseModel

+from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL
 from mteb.encoder_interface import Encoder

 from .languages import ISO_LANGUAGE_SCRIPT
@@ -13,12 +14,19 @@
 if TYPE_CHECKING:
    from .models.sentence_transformer_wrapper import SentenceTransformerWrapper

-Frameworks = Literal["Sentence Transformers", "PyTorch"]
+logger = logging.getLogger(__name__)

-pastdate_adapter = TypeAdapter(date)
-STR_DATE = Annotated[
-    str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value)))
-]  # Allows the type to be a string, but ensures that the string is a valid date
+
+FRAMEWORKS = Literal[
+    "Sentence Transformers",
+    "PyTorch",
+    "GritLM",
+    "LLM2Vec",
+    "TensorFlow",
+    "API",
+    "Tevatron",
+]
+DISTANCE_METRICS = Literal["cosine"]


 def sentence_transformers_loader(
@@ -53,12 +61,19 @@ class ModelMeta(BaseModel):
        embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
        revision: The revision number of the model. If None it is assumed that the metadata (including the loader) is valid for all revisions of the model.
        release_date: The date the model's revision was released.
-        license: The license under which the model is released. Required if open_source is True.
-        open_source: Whether the model is open source or proprietary.
+        license: The license under which the model is released. Required if open_weights is True.
+        open_weights: Whether the model's weights are openly available or proprietary.
+        public_training_data: Whether the training data used to train the model is publicly available.
+        public_training_code: Whether the code used to train the model is publicly available.
        similarity_fn_name: The distance metric used by the model.
        framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
+        reference: A URL to the model's page on Hugging Face or another source.
        languages: The languages the model is intended for specified as a 3 letter language code followed by a script code e.g. "eng-Latn" for English in the Latin script.
+        use_instructions: Whether the model uses instructions, e.g. for prompt-based models. This also includes models that require a specific
+            input format such as "query: {document}" or "passage: {document}".
+        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that
+            models are evaluated non-zero-shot unless specified otherwise.
    """

    name: str | None
@@ -71,9 +86,14 @@ class ModelMeta(BaseModel):
    max_tokens: int | None = None
    embed_dim: int | None = None
    license: str | None = None
-    open_source: bool | None = None
-    similarity_fn_name: str | None = None
-    framework: list[Frameworks] = []
+    open_weights: bool | None = None
+    public_training_data: bool | None = None
+    public_training_code: bool | None = None
+    framework: list[FRAMEWORKS] = []
+    reference: STR_URL | None = None
+    similarity_fn_name: DISTANCE_METRICS | None = None
+    use_instructions: bool | None = None
+    zero_shot_benchmarks: list[str] | None = None

    def to_dict(self):
        dict_repr = self.model_dump()
@@ -83,6 +103,9 @@ def to_dict(self):

    def load_model(self, **kwargs: Any) -> Encoder:
        if self.loader is None:
+            logger.warning(
+                f"Loader not specified for model {self.name}, loading using sentence transformers."
+            )
            loader = partial(
                sentence_transformers_loader,
                model_name=self.name,
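A minimal sketch of how the extended ModelMeta might be filled in; the model name and every field value below are illustrative, not taken from the patch:

from mteb.model_meta import ModelMeta

meta = ModelMeta(
    name="my-org/my-encoder",  # hypothetical model
    revision="abc123",
    release_date="2024-01-01",
    languages=["eng-Latn"],
    open_weights=True,
    public_training_data=False,
    public_training_code=False,
    framework=["Sentence Transformers", "PyTorch"],
    reference="https://huggingface.co/my-org/my-encoder",
    # FRAMEWORKS and DISTANCE_METRICS are Literal types, so pydantic
    # rejects e.g. framework=["Keras"] or similarity_fn_name="dot".
    similarity_fn_name="cosine",
    use_instructions=False,
    n_parameters=100_000_000,
    memory_usage=None,
    embed_dim=768,
    license="mit",
    max_tokens=512,
)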
+ languages=["eng_Latn"], + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=109_000_000, + memory_usage=None, + max_tokens=512, + embed_dim=256, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", + similarity_fn_name="cosine_similarity", + use_instuctions=False, ) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index e68652b2d2..61200d72e0 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -15,9 +15,18 @@ ), name="BAAI/bge-small-en-v1.5", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", release_date="2023-09-12", # initial commit of hf model. + n_parameters=24_000_000, + memory_usage=None, + embed_dim=512, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-small-en-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instuctions=False, ) bge_base_en_v1_5 = ModelMeta( @@ -29,9 +38,18 @@ ), name="BAAI/bge-base-en-v1.5", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", release_date="2023-09-11", # initial commit of hf model. + n_parameters=438_000_000, + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-base-en-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instuctions=False, ) bge_large_en_v1_5 = ModelMeta( @@ -43,7 +61,16 @@ ), name="BAAI/bge-large-en-v1.5", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09", release_date="2023-09-12", # initial commit of hf model. 
diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index 0ca1f4b5ae..3c350a0ad7 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -4,8 +4,17 @@

 arctic_m_v1_5 = ModelMeta(
    name="Snowflake/snowflake-arctic-embed-m-v1.5",
-    languages=["eng_Latn"],
-    open_source=True,
    revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
    release_date="2024-07-08",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=109_000_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=256,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
+    similarity_fn_name="cosine",
+    use_instructions=False,
 )
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index e68652b2d2..61200d72e0 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -15,9 +15,18 @@
    ),
    name="BAAI/bge-small-en-v1.5",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
    release_date="2023-09-12",  # initial commit of hf model.
+    n_parameters=33_000_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-small-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 bge_base_en_v1_5 = ModelMeta(
@@ -29,9 +38,18 @@
    ),
    name="BAAI/bge-base-en-v1.5",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
    release_date="2023-09-11",  # initial commit of hf model.
+    n_parameters=109_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-base-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 bge_large_en_v1_5 = ModelMeta(
@@ -43,7 +61,16 @@
    ),
    name="BAAI/bge-large-en-v1.5",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
    release_date="2023-09-12",  # initial commit of hf model.
+    n_parameters=335_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-large-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )
diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py
index 8c3f67c312..180929329b 100644
--- a/mteb/models/bm25.py
+++ b/mteb/models/bm25.py
@@ -117,7 +117,16 @@ def encode(self, texts: list[str], **kwargs):
    loader=partial(bm25_loader, model_name="bm25s"),  # type: ignore
    name="bm25s",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="0_1_10",
    release_date="2024-07-10",  ## release of version 0.1.10
+    n_parameters=None,
+    memory_usage=None,
+    embed_dim=None,
+    license=None,
+    max_tokens=None,
+    reference=None,
+    similarity_fn_name=None,
+    framework=[],
+    use_instructions=False,
 )
diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py
index 51e7e77431..61abccb9da 100644
--- a/mteb/models/cache_wrapper.py
+++ b/mteb/models/cache_wrapper.py
@@ -4,7 +4,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any

 import numpy as np
 import torch
@@ -21,7 +21,7 @@ class TextVectorMap:
    def __init__(
        self,
-        directory: Union[str | Path],
+        directory: str | Path,
        initial_vectors: int = 100000,
    ):
        self.directory = Path(directory)
@@ -29,9 +29,9 @@ def __init__(
        self.vectors_file = self.directory / "vectors.npy"
        self.index_file = self.directory / "index.json"
        self.dimension_file = self.directory / "dimension"
-        self.hash_to_index: Dict[str, int] = {}
-        self.vectors: Optional[np.memmap] = None
-        self.vector_dim: Optional[int] = None
+        self.hash_to_index: dict[str, int] = {}
+        self.vectors: np.memmap | None = None
+        self.vector_dim: int | None = None
        self.initial_vectors = initial_vectors
        logger.info(f"Initialized TextVectorMap in directory: {self.directory}")
        self._initialize_vectors_file()
@@ -141,7 +141,7 @@ def save(self) -> None:
            logger.error(f"Error saving TextVectorMap: {str(e)}")
            raise

-    def load(self, name: str = None) -> None:
+    def load(self, name: str | None = None) -> None:
        name_details = name if name else ""
        try:
            self._load_dimension()
@@ -176,7 +176,7 @@ def load(self, name: str = None) -> None:
            logger.error(f"Error loading TextVectorMap ({name_details}): {str(e)}")
            raise

-    def get_vector(self, text: str) -> Optional[np.ndarray]:
+    def get_vector(self, text: str) -> np.ndarray | None:
        try:
            text_hash = self._hash_text(text)
            if text_hash not in self.hash_to_index:
@@ -203,7 +203,7 @@ def close(self):


 class CachedEmbeddingWrapper(Wrapper, Encoder):
-    def __init__(self, model: Encoder, cache_path: Union[str | Path]):
+    def __init__(self, model: Encoder, cache_path: str | Path):
        self._model = model
        self.cache_path = Path(cache_path)
        self.cache_path.mkdir(parents=True, exist_ok=True)
@@ -217,7 +217,7 @@ def __init__(self, model: Encoder, cache_path: Union[str | Path]):

        logger.info("Initialized CachedEmbeddingWrapper")

-    def encode(self, texts: List[str], batch_size: int = 32, **kwargs) -> np.ndarray:
+    def encode(self, texts: list[str], batch_size: int = 32, **kwargs) -> np.ndarray:
        """Encode texts using the wrapped model, with caching"""
        try:
            results = []
@@ -282,7 +282,7 @@ def __getattr__(self, name: str) -> Any:
                f"has attribute '{name}'"
            )

-    def __dir__(self) -> List[str]:
+    def __dir__(self) -> list[str]:
        """Return all attributes from both this class and the wrapped model"""
        return list(set(super().__dir__() + dir(self._model)))
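A hypothetical usage sketch of CachedEmbeddingWrapper (the cache directory name is illustrative): repeated encode() calls for the same text should be served from the on-disk vector map rather than recomputed.

import mteb
from mteb.models.cache_wrapper import CachedEmbeddingWrapper

base_model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
cached = CachedEmbeddingWrapper(base_model, cache_path="embedding_cache")

emb_1 = cached.encode(["hello world"])  # computed and written to the cache
emb_2 = cached.encode(["hello world"])  # served from the on-disk vector map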
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index f55e9465fa..276bc6587c 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -83,7 +83,7 @@ def encode(
    ),
    name="embed-multilingual-v3.0",
    languages=[],  # Unknown, but support >100 languages
-    open_source=False,
+    open_weights=False,
    revision="1",
    release_date="2023-11-02",
    n_parameters=None,
@@ -92,7 +92,8 @@
    embed_dim=1024,
    license=None,
    similarity_fn_name="cosine",
-    framework=[],
+    framework=["API"],
+    use_instructions=False,
 )

 cohere_eng_3 = ModelMeta(
@@ -103,7 +104,7 @@ def encode(
    ),
    name="embed-english-v3.0",
    languages=["eng-Latn"],
-    open_source=False,
+    open_weights=False,
    revision="1",
    release_date="2023-11-02",
    n_parameters=None,
@@ -112,5 +113,6 @@
    embed_dim=1024,
    license=None,
    similarity_fn_name="cosine",
-    framework=[],
+    framework=["API"],
+    use_instructions=False,
 )
diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
index 967ebd5326..0be991347e 100644
--- a/mteb/models/e5_instruct.py
+++ b/mteb/models/e5_instruct.py
@@ -63,9 +63,18 @@ def encode(
    ),
    name="intfloat/multilingual-e5-large-instruct",
    languages=XLMR_LANGUAGES,
-    open_source=True,
+    open_weights=True,
    revision="baa7be480a7de1539afce709c8f13f833a510e0a",
    release_date=E5_PAPER_RELEASE_DATE,
+    framework=["GritLM", "PyTorch"],
+    similarity_fn_name="cosine",
+    use_instructions=True,
+    reference="https://huggingface.co/intfloat/multilingual-e5-large-instruct",
+    n_parameters=560_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=514,
 )

 e5_mistral = ModelMeta(
@@ -82,7 +91,16 @@ def encode(
    ),
    name="intfloat/e5-mistral-7b-instruct",
    languages=MISTRAL_LANGUAGES,
-    open_source=True,
+    open_weights=True,
    revision="07163b72af1488142a360786df853f237b1a3ca1",
    release_date=E5_PAPER_RELEASE_DATE,
+    framework=["GritLM", "PyTorch"],
+    similarity_fn_name="cosine",
+    use_instructions=True,
+    reference="https://huggingface.co/intfloat/e5-mistral-7b-instruct",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    embed_dim=4096,
+    license="mit",
+    max_tokens=32768,
 )
diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
index ffe511e50d..5549c7dd86 100644
--- a/mteb/models/e5_models.py
+++ b/mteb/models/e5_models.py
@@ -122,9 +122,18 @@
    ),
    name="intfloat/multilingual-e5-small",
    languages=XLMR_LANGUAGES,
-    open_source=True,
+    open_weights=True,
    revision="fd1525a9fd15316a2d503bf26ab031a61d056e98",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=118_000_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/intfloat/multilingual-e5-small",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_mult_base = ModelMeta(
@@ -135,9 +144,18 @@
    ),
    name="intfloat/multilingual-e5-base",
    languages=XLMR_LANGUAGES,
-    open_source=True,
+    open_weights=True,
    revision="d13f1b27baf31030b7fd040960d60d909913633f",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=278_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="mit",
+    max_tokens=514,
+    reference="https://huggingface.co/intfloat/multilingual-e5-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_mult_large = ModelMeta(
@@ -149,9 +167,18 @@
    ),
    name="intfloat/multilingual-e5-large",
    languages=XLMR_LANGUAGES,
-    open_source=True,
+    open_weights=True,
    revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=560_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=514,
+    reference="https://huggingface.co/intfloat/multilingual-e5-large",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_eng_small_v2 = ModelMeta(
@@ -162,9 +189,18 @@
    ),
    name="intfloat/e5-small-v2",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="dca8b1a9dae0d4575df2bf423a5edb485a431236",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=33_000_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/intfloat/e5-small-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_eng_small = ModelMeta(
@@ -176,9 +212,18 @@
    ),
    name="intfloat/e5-small",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="e272f3049e853b47cb5ca3952268c6662abda68f",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=33_000_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/intfloat/e5-small",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_eng_base_v2 = ModelMeta(
@@ -190,9 +235,18 @@
    ),
    name="intfloat/e5-base-v2",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="1c644c92ad3ba1efdad3f1451a637716616a20e8",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=109_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/intfloat/e5-base-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )

 e5_eng_large_v2 = ModelMeta(
@@ -204,7 +258,16 @@
    ),
    name="intfloat/e5-large-v2",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="b322e09026e4ea05f42beadf4d661fb4e101d311",
    release_date=E5_PAPER_RELEASE_DATE,
+    n_parameters=335_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/intfloat/e5-large-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )
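The use_instructions=True on the E5 entries reflects the E5 model cards' requirement that inputs carry a "query: "/"passage: " prefix; a sketch of that convention (the prefix format comes from the model cards, not this patch):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small")
q = model.encode(["query: how do I cook lentils?"])
d = model.encode(["passage: Rinse the lentils, then simmer them for 20 minutes."])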
diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py
index 1015ca09b9..3ec6f384c0 100644
--- a/mteb/models/google_models.py
+++ b/mteb/models/google_models.py
@@ -94,7 +94,7 @@ def encode(
    ),
    name=name,
    languages=["eng-Latn"],
-    open_source=False,
+    open_weights=False,
    revision="1",  # revision is intended for implementation
    release_date=None,  # couldn't figure this out
    n_parameters=None,
@@ -103,5 +103,6 @@
    embed_dim=768,
    license=None,
    similarity_fn_name="cosine",  # assumed
-    framework=[],
+    framework=["API"],
+    use_instructions=True,
 )
diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
index 59f1f9ac31..3f5dc173a1 100644
--- a/mteb/models/gritlm_models.py
+++ b/mteb/models/gritlm_models.py
@@ -59,9 +59,18 @@ def encode(
    ),
    name="GritLM/GritLM-7B",
    languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="13f00a0e36500c80ce12870ea513846a066004af",
    release_date="2024-02-15",
+    n_parameters=7_240_000_000,
+    memory_usage=None,
+    embed_dim=4096,
+    license="apache-2.0",
+    max_tokens=4096,
+    reference="https://huggingface.co/GritLM/GritLM-7B",
+    similarity_fn_name="cosine",
+    framework=["GritLM", "PyTorch"],
+    use_instructions=True,
 )

 gritlm8x7b = ModelMeta(
    loader=partial(
@@ -72,7 +81,16 @@ def encode(
    ),
    name="GritLM/GritLM-8x7B",
    languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="7f089b13e3345510281733ca1e6ff871b5b4bc76",
    release_date="2024-02-15",
+    n_parameters=57_920_000_000,
+    memory_usage=None,
+    embed_dim=4096,
+    license="apache-2.0",
+    max_tokens=4096,
+    reference="https://huggingface.co/GritLM/GritLM-8x7B",
+    similarity_fn_name="cosine",
+    framework=["GritLM", "PyTorch"],
+    use_instructions=True,
 )
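For the GritLM entries, use_instructions=True refers to GritLM's embed template; a sketch based on the GritLM README (treat the exact template string and helper as an assumption, not part of this patch):

from gritlm import GritLM

def gritlm_instruction(instruction: str) -> str:
    # Embed-only prompts use an empty instruction.
    return "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"

model = GritLM("GritLM/GritLM-7B", torch_dtype="auto")
docs = model.encode(["Generative representational instruction tuning"], instruction=gritlm_instruction(""))
queries = model.encode(["What is GRIT?"], instruction=gritlm_instruction("Given a query, retrieve relevant passages"))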
diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py
index f90097464a..b6cc9bfb2e 100644
--- a/mteb/models/gte_models.py
+++ b/mteb/models/gte_models.py
@@ -61,7 +61,15 @@ def encode(
    ),
    name="Alibaba-NLP/gte-Qwen2-7B-instruct",
    languages=None,
-    open_source=True,
+    open_weights=True,
    revision="e26182b2122f4435e8b3ebecbf363990f409b45b",
    release_date="2024-06-15",  # initial commit of hf model.
+    n_parameters=7_613_000_000,
+    memory_usage=None,
+    embed_dim=3584,
+    license="apache-2.0",
+    reference="https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )
diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py
index d7c2aaeced..79f8d7950c 100644
--- a/mteb/models/llm2vec_models.py
+++ b/mteb/models/llm2vec_models.py
@@ -92,9 +92,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="baa8ebf04a1c2500e61288e7dad65e8ae42601a7",  # TODO: Not sure what to put here as a model is made of two peft repos, each with a different revision
    release_date="2024-04-09",
+    n_parameters=7_505_000_000,
+    memory_usage=None,
+    max_tokens=8192,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_llama3_8b_unsupervised = ModelMeta(
@@ -107,9 +116,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="1cb7b735326d13a8541db8f57f35da5373f5e9c6",
    release_date="2024-04-09",
+    n_parameters=7_505_000_000,
+    memory_usage=None,
+    max_tokens=8192,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )


@@ -123,9 +141,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="0ae69bdd5816105778b971c3138e8f8a18eaa3ae",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_mistral7b_unsupervised = ModelMeta(
@@ -138,9 +165,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_llama2_7b_supervised = ModelMeta(
@@ -153,9 +189,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_llama2_7b_unsupervised = ModelMeta(
@@ -168,9 +213,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="a76944871d169ebe7c97eb921764cd063afed785",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_sheared_llama_supervised = ModelMeta(
@@ -183,9 +237,18 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="a5943d406c6b016fef3f07906aac183cf1a0b47d",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )

 llm2vec_sheared_llama_unsupervised = ModelMeta(
@@ -198,7 +261,16 @@ def loader_inner(**kwargs: Any) -> Encoder:
    ),
    name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse",
    languages=["eng_Latn"],
-    open_source=True,
-    revision=None,
+    open_weights=True,
+    revision="a5943d406c6b016fef3f07906aac183cf1a0b47d",
    release_date="2024-04-09",
+    n_parameters=7_111_000_000,
+    memory_usage=None,
+    max_tokens=32768,
+    embed_dim=4096,
+    license="mit",
+    reference="https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse",
+    similarity_fn_name="cosine",
+    framework=["LLM2Vec", "PyTorch"],
+    use_instructions=True,
 )
diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py
index 2e07130a12..b5451e30ec 100644
--- a/mteb/models/mxbai_models.py
+++ b/mteb/models/mxbai_models.py
@@ -15,7 +15,16 @@
    ),
    name="mixedbread-ai/mxbai-embed-large-v1",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="990580e27d329c7408b3741ecff85876e128e203",
    release_date="2024-03-07",  # initial commit of hf model.
+    n_parameters=335_000_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=1024,
+    license="apache-2.0",
+    reference="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )
diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py
index 00e8b48348..00c9341b3b 100644
--- a/mteb/models/nomic_models.py
+++ b/mteb/models/nomic_models.py
@@ -88,7 +88,7 @@ def encode(  # type: ignore
    ),
    name="nomic-ai/nomic-embed-text-v1.5",
    languages=["eng-Latn"],
-    open_source=True,
+    open_weights=True,
    revision="b0753ae76394dd36bcfb912a46018088bca48be0",
    release_date="2024-02-10",  # first commit
 )
@@ -103,7 +103,16 @@ def encode(  # type: ignore
    ),
    name="nomic-ai/nomic-embed-text-v1",
    languages=["eng-Latn"],
-    open_source=True,
+    open_weights=True,
    revision="0759316f275aa0cb93a5b830973843ca66babcf5",
    release_date="2024-01-31",  # first commit
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=8192,
+    embed_dim=768,
+    license="apache-2.0",
+    reference="https://huggingface.co/nomic-ai/nomic-embed-text-v1",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 06f9ce56c8..d1eaf61644 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -63,7 +63,14 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
    loader=partial(OpenAIWrapper, model_name="text-embedding-3-small"),
    max_tokens=8191,
    embed_dim=1536,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://openai.com/index/new-embedding-models-and-api-updates/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )
 text_embedding_3_large = ModelMeta(
    name="text-embedding-3-large",
@@ -73,7 +80,11 @@
    loader=partial(OpenAIWrapper, model_name="text-embedding-3-large"),
    max_tokens=8191,
    embed_dim=3072,
-    open_source=False,
+    open_weights=False,
+    framework=["API"],
+    use_instructions=False,
+    n_parameters=None,
+    memory_usage=None,
 )
 text_embedding_ada_002 = ModelMeta(
    name="text-embedding-ada-002",
@@ -83,5 +94,9 @@
    loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002"),
    max_tokens=8191,
    embed_dim=1536,
-    open_source=False,
+    open_weights=False,
+    framework=["API"],
+    use_instructions=False,
+    n_parameters=None,
+    memory_usage=None,
 )
languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="48d6d0fc4e02fb1269b36940650a1b7233035cbb-2ead22cfb1b0e0c519c371c63c2ab90ffc511b8a", # base-peft revision release_date="2024-09-15", + n_parameters=8_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=4096, + license="apache-2.0", + reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-v1", + similarity_fn_name="cosine", + framework=["PyTorch", "Tevatron"], + use_instuctions=True, ) @@ -83,9 +101,18 @@ def loader_inner(**kwargs: Any) -> Encoder: ), name="samaya-ai/promptriever-llama3.1-8b-instruct-v1", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision release_date="2024-09-15", + n_parameters=8_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=4096, + license="apache-2.0", + reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-instruct-v1", + similarity_fn_name="cosine", + framework=["PyTorch", "Tevatron"], + use_instuctions=True, ) promptriever_mistral_v1 = ModelMeta( @@ -98,7 +125,16 @@ def loader_inner(**kwargs: Any) -> Encoder: ), name="samaya-ai/promptriever-mistral-v0.1-7b-v1", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision release_date="2024-09-15", + n_parameters=7_000_000, + memory_usage=None, + max_tokens=4096, + embed_dim=4096, + license="apache-2.0", + reference="https://huggingface.co/samaya-ai/promptriever-mistral-v0.1-7b-v1", + similarity_fn_name="cosine", + framework=["PyTorch", "Tevatron"], + use_instuctions=True, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 47289cf92a..6f2f93f169 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -141,9 +141,18 @@ def loader_inner(**kwargs: Any) -> Encoder: ), name="castorini/repllama-v1-7b-lora-passage", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-6097554dfe6e7d93e92f55010b678bcca1e233a8", # base-peft revision release_date="2023-10-11", + n_parameters=7_000_000, + memory_usage=None, + max_tokens=4096, + embed_dim=4096, + license="apache-2.0", + reference="https://huggingface.co/samaya-ai/castorini/repllama-v1-7b-lora-passage", + similarity_fn_name="cosine", + framework=["PyTorch", "Tevatron"], + use_instuctions=True, ) @@ -158,7 +167,16 @@ def loader_inner(**kwargs: Any) -> Encoder: ), name="samaya-ai/RepLLaMA-reproduced", languages=["eng_Latn"], - open_source=True, + open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision release_date="2024-09-15", + n_parameters=7_000_000, + memory_usage=None, + max_tokens=4096, + embed_dim=4096, + license="apache-2.0", + reference="https://huggingface.co/samaya-ai/RepLLaMA-reproduced", + similarity_fn_name="cosine", + framework=["PyTorch", "Tevatron"], + use_instuctions=True, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index d10155a619..1328005a33 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -9,33 +9,69 @@ rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], - open_source=True, + open_weights=True, revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", release_date="2021-10-28", + 
diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index d10155a619..1328005a33 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -9,33 +9,69 @@
 rubert_tiny2 = ModelMeta(
    name="cointegrated/rubert-tiny2",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3",
    release_date="2021-10-28",
+    n_parameters=29_400_000,
+    memory_usage=None,
+    embed_dim=312,
+    license="mit",
+    max_tokens=2048,
+    reference="https://huggingface.co/cointegrated/rubert-tiny2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 rubert_tiny = ModelMeta(
    name="cointegrated/rubert-tiny",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="5441c5ea8026d4f6d7505ec004845409f1259fb1",
    release_date="2021-05-24",
+    n_parameters=29_400_000,
+    memory_usage=None,
+    embed_dim=312,
+    license="mit",
+    max_tokens=2048,
+    reference="https://huggingface.co/cointegrated/rubert-tiny",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 sbert_large_nlu_ru = ModelMeta(
    name="ai-forever/sbert_large_nlu_ru",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a",
    release_date="2020-11-20",
+    n_parameters=427_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,  # best guess
+    reference="https://huggingface.co/ai-forever/sbert_large_nlu_ru",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 sbert_large_mt_nlu_ru = ModelMeta(
    name="ai-forever/sbert_large_mt_nlu_ru",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0",
    release_date="2021-05-18",
+    n_parameters=427_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="Not specified",
+    max_tokens=512,  # best guess
+    reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 user_base_ru = ModelMeta(
@@ -47,65 +83,137 @@
    ),
    name="deepvk/USER-base",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="436a489a2087d61aa670b3496a9915f84e46c861",
    release_date="2024-06-10",
+    n_parameters=427_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="Not specified",
+    max_tokens=512,  # best guess
+    reference="https://huggingface.co/deepvk/USER-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 deberta_v1_ru = ModelMeta(
    name="deepvk/deberta-v1-base",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4",
    release_date="2023-02-07",
+    n_parameters=124_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/deepvk/deberta-v1-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 rubert_base_cased = ModelMeta(
    name="DeepPavlov/rubert-base-cased",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="4036cab694767a299f2b9e6492909664d9414229",
    release_date="2020-03-04",
+    n_parameters=180_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="Not specified",
+    max_tokens=512,  # best guess
+    reference="https://huggingface.co/DeepPavlov/rubert-base-cased",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 distilrubert_small_cased_conversational = ModelMeta(
    name="DeepPavlov/distilrubert-small-cased-conversational",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="e348066b4a7279b97138038299bddc6580a9169a",
    release_date="2022-06-28",
+    n_parameters=107_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="Not specified",
+    max_tokens=512,
+    reference="https://huggingface.co/DeepPavlov/distilrubert-small-cased-conversational",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 rubert_base_cased_sentence = ModelMeta(
    name="DeepPavlov/rubert-base-cased-sentence",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8",
    release_date="2020-03-04",
+    n_parameters=107_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="Not specified",
+    max_tokens=512,
+    reference="https://huggingface.co/DeepPavlov/rubert-base-cased-sentence",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 labse_en_ru = ModelMeta(
    name="cointegrated/LaBSE-en-ru",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e",
    release_date="2021-06-10",
+    n_parameters=129_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="Not specified",
+    max_tokens=512,
+    reference="https://huggingface.co/cointegrated/LaBSE-en-ru",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 rubert_tiny_turbo = ModelMeta(
    name="sergeyzh/rubert-tiny-turbo",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054",
    release_date="2024-06-21",
+    n_parameters=129_000_000,
+    memory_usage=None,
+    embed_dim=312,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/sergeyzh/rubert-tiny-turbo",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 labse_ru_turbo = ModelMeta(
    name="sergeyzh/LaBSE-ru-turbo",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="1940b046c6b5e125df11722b899130329d0a46da",
    release_date="2024-06-27",
+    n_parameters=129_000_000,
+    memory_usage=None,
+    embed_dim=312,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/sergeyzh/LaBSE-ru-turbo",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )


@@ -123,7 +231,7 @@
    ),
    name="ai-forever/ru-en-RoSBERTa",
    languages=["rus_Cyrl"],
-    open_source=True,
+    open_weights=True,
    revision="89fb1651989adbb1cfcfdedafd7d102951ad0555",
    release_date="2024-07-29",
 )
diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py
index 0b67c9d688..fe70f597ae 100644
--- a/mteb/models/salesforce_models.py
+++ b/mteb/models/salesforce_models.py
@@ -62,7 +62,16 @@ def encode(
    ),
    name="Salesforce/SFR-Embedding-2_R",
    languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
    revision="91762139d94ed4371a9fa31db5551272e0b83818",
    release_date="2024-06-14",  # initial commit of hf model.
+    n_parameters=7_110_000_000,
+    memory_usage=None,
+    embed_dim=4096,
+    license="cc-by-nc-4.0",
+    max_tokens=32768,
+    reference="https://huggingface.co/Salesforce/SFR-Embedding-2_R",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
 )
diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py
index a3603d9eb3..9a33e0f64f 100644
--- a/mteb/models/sentence_transformers_models.py
+++ b/mteb/models/sentence_transformers_models.py
@@ -63,31 +63,67 @@
 all_MiniLM_L6_v2 = ModelMeta(
    name="sentence-transformers/all-MiniLM-L6-v2",
    languages=["eng-Latn"],
-    open_source=True,
+    open_weights=True,
    revision="8b3219a92973c328a8e22fadcfa821b5dc75636a",  # can be any
    release_date="2021-08-30",
+    n_parameters=22_700_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta(
    name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    languages=paraphrase_langs,
-    open_source=True,
+    open_weights=True,
    revision="bf3bf13ab40c3157080a7ab344c831b9ad18b5eb",  # can be any
    release_date="2019-11-01",  # release date of paper
+    n_parameters=118_000_000,
+    memory_usage=None,
+    embed_dim=384,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 paraphrase_multilingual_mpnet_base_v2 = ModelMeta(
    name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    languages=paraphrase_langs,
-    open_source=True,
+    open_weights=True,
    revision="79f2382ceacceacdf38563d7c5d16b9ff8d725d6",  # can be any
    release_date="2019-11-01",  # release date of paper
+    n_parameters=278_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )

 labse = ModelMeta(
    name="sentence-transformers/LaBSE",
    languages=paraphrase_langs,
-    open_source=True,
+    open_weights=True,
    revision="e34fab64a3011d2176c99545a93d5cbddc9a91b7",  # can be any
    release_date="2019-11-01",  # release date of paper
+    n_parameters=471_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/sentence-transformers/LaBSE",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
 )
diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py
index 2845f36e19..4e79f189ee 100644
--- a/mteb/models/voyage_models.py
+++ b/mteb/models/voyage_models.py
@@ -155,7 +155,14 @@ def _batched_encode(
    ),
    max_tokens=16000,
    embed_dim=1024,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
 )

 voyage_finance_2 = ModelMeta(
@@ -170,7 +177,14 @@ def _batched_encode(
    ),
    max_tokens=32000,
    embed_dim=1024,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )
 voyage_law_2 = ModelMeta(
@@ -185,7 +199,14 @@ def _batched_encode(
    ),
    max_tokens=16000,
    embed_dim=1024,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )

 voyage_code_2 = ModelMeta(
@@ -200,7 +221,14 @@ def _batched_encode(
    ),
    max_tokens=16000,
    embed_dim=1536,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )

 voyage_large_2 = ModelMeta(
@@ -215,7 +243,14 @@ def _batched_encode(
    ),
    max_tokens=16000,
    embed_dim=1536,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )

 voyage_2 = ModelMeta(
@@ -230,9 +265,15 @@ def _batched_encode(
    ),
    max_tokens=4000,
    embed_dim=1024,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )

-# see https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/"
 voyage_multilingual_2 = ModelMeta(
    name="voyage-multilingual-2",
    revision="1",
@@ -245,5 +286,12 @@ def _batched_encode(
    ),
    max_tokens=32000,
    embed_dim=1024,
-    open_source=False,
+    open_weights=False,
+    n_parameters=None,
+    memory_usage=None,
+    license=None,
+    reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=False,
 )
diff --git a/pyproject.toml b/pyproject.toml
index 410e2581a5..5f4f9d5ac7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -120,7 +120,6 @@ ignore = ["E501", # line too long
    "D415", # First line should end with a period
]

-
[tool.ruff.lint.flake8-implicit-str-concat]
allow-multiline = false
diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py
index 2940fd9593..c540bb41ee 100644
--- a/tests/test_tasks/test_mteb_rerank.py
+++ b/tests/test_tasks/test_mteb_rerank.py
@@ -370,7 +370,7 @@ def test_reranker_same_ndcg1():
    ce.mteb_model_meta = ModelMeta(
        name="cross-encoder/ms-marco-TinyBERT-L-2-v2",
        languages=["eng-Latn"],
-        open_source=True,
+        open_weights=True,
        revision=ce_revision,
        release_date="2021-04-15",
    )