diff --git a/mteb/cli.py b/mteb/cli.py index 3c57cf3f8a..ece20027f9 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -11,7 +11,7 @@ To run a model on a set of tasks, use the `mteb run` command. For example: ```bash -mteb run -m average_word_embeddings_komninos \ +mteb run -m sentence-transformers/average_word_embeddings_komninos \ -t Banking77Classification EmotionClassification \ --output_folder mteb_output \ --verbosity 3 @@ -44,7 +44,7 @@ Once a model is run you can create the metadata for a model card from a folder of results, use the `mteb create_meta` command. For example: ```bash -mteb create_meta --results_folder mteb_output/average_word_embeddings_komninos/{revision} \ +mteb create_meta --results_folder mteb_output/sentence-transformers__average_word_embeddings_komninos/{revision} \ --output_path model_card.md ``` diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 96ef821021..74f4a79f64 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -11,7 +11,7 @@ NotASafetensorsRepoError, SafetensorsParsingError, ) -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, field_validator from mteb.abstasks.AbsTask import AbsTask from mteb.encoder_interface import Encoder @@ -63,7 +63,7 @@ class ModelMeta(BaseModel): Attributes: loader: the function that loads the model. If None it will just default to loading the model using the sentence transformer library. - name: The name of the model, ideally the name on huggingface. + name: The name of the model, ideally the name on huggingface. It should be in the format "organization/model_name". n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or if the loader returns a SentenceTransformer model from which it can be derived. memory_usage_mb: The memory usage of the model in MB. Can be None if the memory usage is not known (e.g. for proprietary models). To calculate it use the `calculate_memory_usage_mb` method. @@ -123,6 +123,17 @@ def to_dict(self): dict_repr["loader"] = get_loader_name(loader) return dict_repr + @field_validator("name") + @classmethod + def check_name(cls, v: str | None) -> str | None: + if v is None or v == "bm25s": + return v + if "/" not in v: + raise ValueError( + "Model name must be in the format 'organization/model_name'" + ) + return v + def load_model(self, **kwargs: Any) -> Encoder: if self.loader is None: logger.warning( diff --git a/mteb/models/b1ade_models.py b/mteb/models/b1ade_models.py index cfc9b5ebb0..fe7bae4d51 100644 --- a/mteb/models/b1ade_models.py +++ b/mteb/models/b1ade_models.py @@ -10,7 +10,7 @@ b1ade_embed = ModelMeta( loader=sentence_transformers_loader, - name="b1ade-embed", + name="w601sxs/b1ade-embed", languages=["eng-Latn"], revision="3bdac13927fdc888b903db93b2ffdbd90b295a69", open_weights=True, diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index a3856627b1..598eb59882 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -182,7 +182,7 @@ def get_fused_embeddings( cohere_mult_3 = ModelMeta( loader=partial(cohere_v_loader, model_name="embed-multilingual-v3.0"), - name="embed-multilingual-v3.0-v", + name="Cohere/Cohere-embed-multilingual-v3.0", languages=[], # Unknown, but support >100 languages revision="1", release_date="2024-10-24", @@ -204,7 +204,7 @@ def get_fused_embeddings( cohere_eng_3 = ModelMeta( loader=partial(cohere_v_loader, model_name="embed-english-v3.0"), - name="embed-english-v3.0-v", + name="Cohere/Cohere-embed-english-v3.0", languages=["eng-Latn"], revision="1", release_date="2024-10-24", diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py index a0d35cf8fa..cd14af0a0a 100644 --- a/tests/test_benchmark/mock_models.py +++ b/tests/test_benchmark/mock_models.py @@ -45,7 +45,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): class MockCLIPEncoder: mteb_model_meta = ModelMeta( - name="MockCLIPModel", + name="mock/MockCLIPModel", languages=["eng_Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", release_date="2021-02-06", @@ -91,7 +91,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): class MockMocoEncoder: mteb_model_meta = ModelMeta( - name="MockMocoModel", + name="mock/MockMocoModel", languages=["eng_Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-01-01", diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index d7357664fe..a1c08cd0fa 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -364,7 +364,7 @@ def test_task_modality_filtering(mock_logger, task): f"'{modality}'" for modality in sorted(task.metadata.modalities) ) mock_logger.assert_called_with( - f"MockMocoModel only supports ['image'], but the task modalities are [{task_modalities}]." + f"mock/MockMocoModel only supports ['image'], but the task modalities are [{task_modalities}]." ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 743e080ba9..c91d47fb83 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,7 +33,7 @@ def test_available_benchmarks(): run_task_fixures = [ ( - "average_word_embeddings_komninos", + "sentence-transformers/average_word_embeddings_komninos", "BornholmBitextMining", "21eec43590414cb8e3a6f654857abed0483ae36e", ), @@ -189,7 +189,7 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: def test_save_predictions(): - command = f"{sys.executable} -m mteb run -m average_word_embeddings_komninos -t NFCorpus --output_folder tests/results --save_predictions" + command = f"{sys.executable} -m mteb run -m sentence-transformers/average_word_embeddings_komninos -t NFCorpus --output_folder tests/results --save_predictions" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" test_folder = Path(__file__).parent diff --git a/tests/test_encoder_interfaces.py b/tests/test_encoder_interfaces.py index 546a41152e..39ade8dffc 100644 --- a/tests/test_encoder_interfaces.py +++ b/tests/test_encoder_interfaces.py @@ -7,12 +7,16 @@ def test_sentence_is_encoder(): - model = SentenceTransformer("average_word_embeddings_komninos") + model = SentenceTransformer( + "sentence-transformers/average_word_embeddings_komninos" + ) assert isinstance(model, Encoder) def test_wrapped_sentence_is_encoder_with_query_corpus_encode(): - model = SentenceTransformer("average_word_embeddings_komninos") + model = SentenceTransformer( + "sentence-transformers/average_word_embeddings_komninos" + ) model = DRESModel(model) assert isinstance(model, Encoder) diff --git a/tests/test_models/test_model_meta.py b/tests/test_models/test_model_meta.py index e772f20824..f0c54dd99c 100644 --- a/tests/test_models/test_model_meta.py +++ b/tests/test_models/test_model_meta.py @@ -36,7 +36,7 @@ def test_model_memory_usage_api_model(): ) def test_model_similar_tasks(training_datasets): dummy_model_meta = ModelMeta( - name="test_model", + name="test/test_model", revision="test", release_date=None, languages=None, @@ -67,6 +67,32 @@ def test_model_similar_tasks(training_datasets): assert sorted(dummy_model_meta.get_training_datasets().keys()) == expected +def test_model_name_without_prefix(): + with pytest.raises(ValueError): + ModelMeta( + name="test_model", + revision="test", + release_date=None, + languages=None, + loader=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + framework=[], + reference=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, + ) + + def test_model_training_dataset_adapted(): model_meta = mteb.get_model_meta("deepvk/USER-bge-m3") assert model_meta.adapted_from == "BAAI/bge-m3" diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index f588d7e18b..4a709d3b83 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -361,7 +361,7 @@ def test_mteb_rerank(tmp_path: Path): def test_reranker_same_ndcg1(): - de_name = "average_word_embeddings_komninos" + de_name = "sentence-transformers/average_word_embeddings_komninos" revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) ce = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") @@ -406,7 +406,7 @@ def test_reranker_same_ndcg1(): # read in stage 1 and stage two and check ndcg@1 is the same with open( - f"tests/results/stage1/sentence-transformers__{de_name}/{revision}/SciFact.json" + f"tests/results/stage1/{de_name.replace('/', '__')}/{revision}/SciFact.json" ) as f: stage1 = json.load(f)