From 6dc277723447c46381f171b89005a9e388d0c11d Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 15 Jan 2025 14:32:58 +0000 Subject: [PATCH 1/3] only return 1 model_name per file --- Makefile | 2 +- mteb/models/sentence_transformers_models.py | 1 + scripts/extract_model_names.py | 32 +++++++++++++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 02d0ba2478..9729d080ff 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ build-docs: model-load-test: @echo "--- 🚀 Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" - python scripts/extract_model_names.py $(BASE_BRANCH) + python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 28349d60d9..e3ff1a736e 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -4,6 +4,7 @@ from mteb.model_meta import ModelMeta +# testing this PR paraphrase_langs = [ "ara_Arab", "bul_Cyrl", diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index ba1bc1a8b0..29121e3ecf 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -1,11 +1,15 @@ from __future__ import annotations +import argparse import ast +import logging import sys from pathlib import Path from git import Repo +logging.basicConfig(level=logging.INFO) + def get_changed_files(base_branch="main"): repo_path = Path(__file__).parent.parent @@ -28,7 +32,9 @@ def get_changed_files(base_branch="main"): ] -def extract_model_names(files: list[str]) -> list[str]: +def extract_model_names( + files: list[str], return_one_model_name_per_file=False +) -> list[str]: model_names = [] for file in files: with open(file) as f: @@ -52,17 +58,39 @@ def extract_model_names(files: list[str]) -> list[str]: ) if model_name: model_names.append(model_name) + if return_one_model_name_per_file: + logging.info( + f"Found model name {model_name} in file {file}" + ) + break # NOTE: Only take the first model_name per file to avoid disk out of space issue. return model_names +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--return_one_model_name_per_file", + action="store_true", + default=False, + help="Only return one model name per file.", + ) + return parser.parse_args() + + if __name__ == "__main__": """ Can pass in base branch as an argument. Defaults to 'main'. e.g. python extract_model_names.py mieb """ + + args = parse_args() + base_branch = sys.argv[1] if len(sys.argv) > 1 else "main" changed_files = get_changed_files(base_branch) - model_names = extract_model_names(changed_files) + model_names = extract_model_names( + changed_files, + return_one_model_name_per_file=args.return_one_model_name_per_file, + ) output_file = Path(__file__).parent / "model_names.txt" with output_file.open("w") as f: f.write(" ".join(model_names)) From e157e9b4178cfa5e447098fd3b24eb4f98093f7b Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 15 Jan 2025 14:53:34 +0000 Subject: [PATCH 2/3] fix args parse --- scripts/extract_model_names.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index 29121e3ecf..6cbaa2c298 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -3,7 +3,6 @@ import argparse import ast import logging -import sys from pathlib import Path from git import Repo @@ -36,6 +35,7 @@ def extract_model_names( files: list[str], return_one_model_name_per_file=False ) -> list[str]: model_names = [] + first_model_found = False for file in files: with open(file) as f: tree = ast.parse(f.read()) @@ -58,16 +58,21 @@ def extract_model_names( ) if model_name: model_names.append(model_name) - if return_one_model_name_per_file: - logging.info( - f"Found model name {model_name} in file {file}" - ) - break # NOTE: Only take the first model_name per file to avoid disk out of space issue. + first_model_found = True + if return_one_model_name_per_file and first_model_found: + logging.info(f"Found model name {model_name} in file {file}") + break # NOTE: Only take the first model_name per file to avoid disk out of space issue. return model_names def parse_args(): parser = argparse.ArgumentParser() + parser.add_argument( + "base_branch", + nargs="?", + default="main", + help="Base branch to compare changes with", + ) parser.add_argument( "--return_one_model_name_per_file", action="store_true", @@ -85,7 +90,7 @@ def parse_args(): args = parse_args() - base_branch = sys.argv[1] if len(sys.argv) > 1 else "main" + base_branch = args.base_branch changed_files = get_changed_files(base_branch) model_names = extract_model_names( changed_files, From 50a93b6c1f0f3ee006decf3cf9daaa4544f726c2 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Wed, 15 Jan 2025 15:03:19 +0000 Subject: [PATCH 3/3] revert test change --- mteb/models/sentence_transformers_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index e3ff1a736e..28349d60d9 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -4,7 +4,6 @@ from mteb.model_meta import ModelMeta -# testing this PR paraphrase_langs = [ "ara_Arab", "bul_Cyrl",