From faa78defc7158a3890c89eecf6535a943fc2c976 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 09:49:21 +0100 Subject: [PATCH 1/7] fix: generate metadata --- scripts/generate_metadata.py | 61 ++++++++++++++++------- tests/scripts/test_generate_model_meta.py | 47 +++++++++++++++++ 2 files changed, 89 insertions(+), 19 deletions(-) create mode 100644 tests/scripts/test_generate_model_meta.py diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 4ae87fdbca..2bd4f72e0f 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -168,12 +168,18 @@ def get_max_token(model_name: str) -> int | None: return None +BASE_MODEL_ERRORS = ["tmp/"] + + def get_base_model(model_name: str) -> str | None: try: file_path = hf_hub_download(repo_id=model_name, filename="config.json") with open(file_path) as in_file: config = json.loads(in_file.read()) base_model = config.get("_name_or_path", None) + if base_model in BASE_MODEL_ERRORS: + print(f"Base model error for {model_name} with base model {base_model}") + return None if base_model != model_name: return base_model else: @@ -183,19 +189,32 @@ def get_base_model(model_name: str) -> str | None: return None -def model_meta_from_hf_hub(model_name: str) -> ModelMeta: +def load_model_card(model_name: str) -> dict: + card = ModelCard.load(model_name) + return card.data.to_dict() + + +def get_language_from_card(card_data: dict) -> str | None: + languages = card_data.get("language", None) + if isinstance(languages, str): + languages = [languages] + if languages is not None: + languages = [convert_code(l) for l in languages] + languages = [l for l in languages if l is not None] + return languages + + +def model_meta_from_hf_hub_cross_encoder(model_name: str) -> ModelMeta: + pass + + +def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: try: - card = ModelCard.load(model_name) - card_data = card.data.to_dict() + card_data = load_model_card(model_name) frameworks = ["PyTorch"] if card_data.get("library_name", None) == "sentence-transformers": frameworks.append("Sentence Transformers") - languages = card_data.get("language", None) - if isinstance(languages, str): - languages = [languages] - if languages is not None: - languages = [convert_code(l) for l in languages] - languages = [l for l in languages if l is not None] + languages = get_language_from_card(card_data) repo_info = api.repo_info(model_name) revision = repo_info.sha release_date = repo_info.created_at.strftime("%Y-%m-%d") @@ -223,11 +242,13 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: adapted_from=get_base_model(model_name), training_datasets=training_datasets, open_weights=True, - superseded_by=None, max_tokens=get_max_token(model_name), embed_dim=n_dimensions, similarity_fn_name="cosine", reference=f"https://huggingface.co/{model_name}", + public_training_code=None, + public_training_data=None, + use_instructions=None, ) except Exception as e: warnings.warn(f"Failed to extract metadata from model: {e}.") @@ -241,12 +262,12 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: embed_dim=None, license=None, open_weights=True, - public_training_code=None, - public_training_data=None, similarity_fn_name=None, - use_instructions=None, training_datasets=None, - frameworks=[], + framework=[], + use_instructions=None, + public_training_data=None, + public_training_code=None, ) @@ -256,14 +277,16 @@ def code_from_meta(meta: ModelMeta) -> str: return template.format(variable_name=variable_name, meta=meta.__repr__()) -def main(): - out_path = Path("mteb/models/misc_models.py") +def main(out_path: Path, model_names: list[str] = to_keep): with open(out_path, "w") as out_file: out_file.write("from mteb.model_meta import ModelMeta\n\n") - for model in tqdm(to_keep, desc="Generating metadata for all models."): - meta = model_meta_from_hf_hub(model) + for model_name in tqdm(model_names, desc="Generating metadata for all models."): + meta = model_meta_from_hf_hub_embedding(model_name) + out_file.write(code_from_meta(meta)) if __name__ == "__main__": - main() + out_path = Path("mteb/models/new_tmp.py") + model_names = ["jinaai/jina-reranker-v2-base-multilingual"] + main(out_path, model_names) diff --git a/tests/scripts/test_generate_model_meta.py b/tests/scripts/test_generate_model_meta.py new file mode 100644 index 0000000000..1153baf665 --- /dev/null +++ b/tests/scripts/test_generate_model_meta.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + +import pytest + +from scripts.generate_metadata import get_base_model, main + + +def test_create_model_meta_embedding_models_from_hf(tmp_path: Path): + models = ["intfloat/multilingual-e5-large", "intfloat/multilingual-e5-small"] + tmp_path = tmp_path / "new_models.py" + main(tmp_path, models) + + assert tmp_path.exists() + assert tmp_path.read_text().startswith("from mteb.model_meta import ModelMeta") + + spec = importlib.util.spec_from_file_location("new_models", tmp_path) + new_models = importlib.util.module_from_spec(spec) + spec.loader.exec_module(new_models) + + assert hasattr(new_models, "intfloat__multilingual_e5_large") + assert hasattr(new_models, "intfloat__multilingual_e5_small") + + assert ( + new_models.intfloat__multilingual_e5_large.name + == "intfloat/multilingual-e5-large" + ) + assert ( + new_models.intfloat__multilingual_e5_small.name + == "intfloat/multilingual-e5-small" + ) + + +def test_get_base_model_name_is_the_same(): + model_name = "jinaai/jina-embeddings-v3" + model = get_base_model(model_name) + assert model is None + + +@pytest.mark.skip(reason="No support for cross-encoder models") +def test_create_model_meta_cross_encoder_models_from_hf(tmp_path: Path): + models = ["intfloat/multilingual-e5-cross-encoder"] + tmp_path = tmp_path / "new_models.py" + main(tmp_path, models) + assert True From 2dde037e613601e4e773fd79131a28cfde02865a Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 10:02:40 +0100 Subject: [PATCH 2/7] use logging not print for script --- scripts/generate_metadata.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 2bd4f72e0f..7c5c79cf42 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -1,13 +1,12 @@ from __future__ import annotations import json -import warnings from pathlib import Path import iso639 from huggingface_hub import HfApi, ModelCard, hf_hub_download from tqdm import tqdm - +import logging from mteb.model_meta import ModelMeta to_keep = [ @@ -137,7 +136,7 @@ def convert_code(code: str) -> str | None: script = lang_to_script[lang_code] return f"{lang_code}_{script}" except Exception as e: - print(f"Couldn't convert {code}, reason: {e}") + logging.warning(f"Couldn't convert {code}, reason: {e}") return None @@ -153,7 +152,7 @@ def get_embedding_dimensions(model_name: str) -> int | None: pooling_config = json.loads(in_file.read()) return pooling_config.get("word_embedding_dimension", None) except Exception as e: - print(f"Couldn't get embedding size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get embedding size for {model_name}, reason: {e}") return None @@ -164,7 +163,7 @@ def get_max_token(model_name: str) -> int | None: config = json.loads(in_file.read()) return config.get("max_position_embeddings", None) except Exception as e: - print(f"Couldn't get embedding size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get embedding size for {model_name}, reason: {e}") return None @@ -178,14 +177,16 @@ def get_base_model(model_name: str) -> str | None: config = json.loads(in_file.read()) base_model = config.get("_name_or_path", None) if base_model in BASE_MODEL_ERRORS: - print(f"Base model error for {model_name} with base model {base_model}") + logging.warning( + f"Base model error for {model_name} with base model {base_model}" + ) return None if base_model != model_name: return base_model else: return None except Exception as e: - print(f"Couldn't get base model for {model_name}, reason: {e}") + logging.warning(f"Couldn't get base model for {model_name}, reason: {e}") return None @@ -221,7 +222,7 @@ def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: try: n_parameters = repo_info.safetensors.total except Exception as e: - print(f"Couldn't get model size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get model size for {model_name}, reason: {e}") n_parameters = None n_dimensions = get_embedding_dimensions(model_name) datasets = card_data.get("datasets", None) @@ -251,7 +252,7 @@ def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: use_instructions=None, ) except Exception as e: - warnings.warn(f"Failed to extract metadata from model: {e}.") + logging.error(f"Failed to extract metadata from model: {e}.") return ModelMeta( name=model_name, revision=None, From 432331675da29dde703a27795ad7e6c5eeef4086 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 10:05:37 +0100 Subject: [PATCH 3/7] lint --- scripts/generate_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 7c5c79cf42..9ffa3fec81 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -1,12 +1,13 @@ from __future__ import annotations import json +import logging from pathlib import Path import iso639 from huggingface_hub import HfApi, ModelCard, hf_hub_download from tqdm import tqdm -import logging + from mteb.model_meta import ModelMeta to_keep = [ From 0e19185f8c6985a6138c8d56cf47babb6dc3bcb3 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 10:07:10 +0100 Subject: [PATCH 4/7] add iso639 to dev pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b291f3f40d..d16376694b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ mteb = "mteb.cli:main" [project.optional-dependencies] dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update -"pytest", "pytest-xdist", "pytest-coverage"] +"pytest", "pytest-xdist", "pytest-coverage", "iso639"] codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] From ec78c3fb01843f27cfc86c8ab4cce1e1a12dbc1f Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 10:36:55 +0100 Subject: [PATCH 5/7] fix import --- tests/scripts/test_generate_model_meta.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/scripts/test_generate_model_meta.py b/tests/scripts/test_generate_model_meta.py index 1153baf665..c930eff497 100644 --- a/tests/scripts/test_generate_model_meta.py +++ b/tests/scripts/test_generate_model_meta.py @@ -5,13 +5,14 @@ import pytest -from scripts.generate_metadata import get_base_model, main +from scripts.generate_metadata import get_base_model +from scripts.generate_metadata import main as generate_metadata_main def test_create_model_meta_embedding_models_from_hf(tmp_path: Path): models = ["intfloat/multilingual-e5-large", "intfloat/multilingual-e5-small"] tmp_path = tmp_path / "new_models.py" - main(tmp_path, models) + generate_metadata_main(tmp_path, models) assert tmp_path.exists() assert tmp_path.read_text().startswith("from mteb.model_meta import ModelMeta") @@ -43,5 +44,5 @@ def test_get_base_model_name_is_the_same(): def test_create_model_meta_cross_encoder_models_from_hf(tmp_path: Path): models = ["intfloat/multilingual-e5-cross-encoder"] tmp_path = tmp_path / "new_models.py" - main(tmp_path, models) + generate_metadata_main(tmp_path, models) assert True From 56a16916748f70f7fc19448c66c6192c9ada1d26 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:29:28 +0100 Subject: [PATCH 6/7] add memory_usage_mb --- scripts/generate_metadata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 9ffa3fec81..5c181275e7 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -251,6 +251,7 @@ def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: public_training_code=None, public_training_data=None, use_instructions=None, + memory_usage_mb=None, ) except Exception as e: logging.error(f"Failed to extract metadata from model: {e}.") @@ -270,6 +271,7 @@ def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: use_instructions=None, public_training_data=None, public_training_code=None, + memory_usage_mb=None, ) From fbf3d49a9121ed4e191261c14bc914c6efde3179 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:14:05 +0100 Subject: [PATCH 7/7] set version for iso639 Co-authored-by: Kenneth Enevoldsen --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d16376694b..2697aa2a75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,9 @@ mteb = "mteb.cli:main" [project.optional-dependencies] dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update -"pytest", "pytest-xdist", "pytest-coverage", "iso639"] +"pytest", "pytest-xdist", "pytest-coverage", +"iso639>=0.1.4" # used for tests/scripts/test_generate_model_meta.py +] codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"]