diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py
index be4f7b207a..14812c0859 100644
--- a/mteb/models/gme_v_models.py
+++ b/mteb/models/gme_v_models.py
@@ -404,6 +404,26 @@ def fetch_image(
 ###
 
 
+training_data = {
+    "MSMARCO": ["train"],
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translation not trained on
+    "HotpotQA": ["train"],
+    "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQAHardNegatives": ["train"],
+    # TriviaQA (Joshi et al., 2017),
+    # SQuAD (Rajpurkar et al., 2016),
+    "FEVER": ["train"],
+    # AllNLI for SimCSE (Gao et al., 2021), selecting a total of 1 million entries.
+    # ImageNet (Deng et al., 2009)
+    # LAION (Schuhmann et al., 2022),
+    # mscoco (Lin et al., 2014),
+    # Docmatix (Laurençon et al., 2024)
+    # synthetic data
+    # M-BEIR (Wei et al., 2024)
+}
 
 
 gme_qwen2vl_2b = ModelMeta(
@@ -416,6 +436,7 @@ def fetch_image(
     open_weights=True,
     revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
     release_date="2024-12-24",
+    modalities=["image", "text"],
     n_parameters=2_210_000_000,
     embed_dim=1536,
     license="apache-2.0",
@@ -426,7 +447,7 @@ def fetch_image(
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    training_datasets=training_data,
 )
 
 gme_qwen2vl_7b = ModelMeta(
@@ -439,6 +460,7 @@ def fetch_image(
     open_weights=True,
     revision="477027a6480f8630363be77751f169cc3434b673",
     release_date="2024-12-24",
+    modalities=["image", "text"],
     n_parameters=8_290_000_000,
     embed_dim=3584,
     license="apache-2.0",
@@ -449,5 +471,5 @@ def fetch_image(
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    training_datasets=training_data,
 )
diff --git a/mteb/models/jina_clip.py b/mteb/models/jina_clip.py
index bff02a76c3..1f9a597803 100644
--- a/mteb/models/jina_clip.py
+++ b/mteb/models/jina_clip.py
@@ -158,18 +158,25 @@ def encode(  # type: ignore
     revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4",
     release_date="2024-05-30",
     modalities=["image", "text"],
-    n_parameters=None,
-    max_tokens=None,
-    embed_dim=None,
-    license=None,
-    open_weights=None,
+    n_parameters=223_000_000,
+    max_tokens=8192,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
     public_training_code=None,
     public_training_data=None,
     framework=["PyTorch"],
-    reference=None,
+    reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=None,
-    use_instructions=None,
-    training_datasets=None,
+    use_instructions=True,
+    training_datasets={
+        # LAION400M
+        # ShareGPT4V
+        "MSMARCO": ["train"],
+        # NQ
+        # HotpotQA
+        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+    },
 )
diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py
index 6cbaa2c298..36cfc572e9 100644
--- a/scripts/extract_model_names.py
+++ b/scripts/extract_model_names.py
@@ -48,14 +48,26 @@ def extract_model_names(
                 and isinstance(node.value.func, ast.Name)
                 and node.value.func.id == "ModelMeta"
             ):
-                model_name = next(
-                    (
-                        kw.value.value
-                        for kw in node.value.keywords
-                        if kw.arg == "name"
-                    ),
-                    None,
-                )
+                try:
+                    model_name = next(
+                        (
+                            kw.value.value
+                            for kw in node.value.keywords
+                            if kw.arg == "name"
+                        ),
+                        None,
+                    )
+                except AttributeError:
+                    # For cases where name is assigned a variable and not a direct
+                    # string, e.g. in gme_v_models.py: `name=HF_GME_QWEN2VL_2B`
+                    model_name = next(
+                        (
+                            kw.value.id
+                            for kw in node.value.keywords
+                            if kw.arg == "name"
+                        ),
+                        None,
+                    )
                 if model_name:
                     model_names.append(model_name)
                     first_model_found = True
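
For context, a minimal standalone sketch (not part of the patch) of the `ast` behaviour the new `try/except` in `scripts/extract_model_names.py` works around: an `ast.Constant` node exposes a string literal via `.value`, while an `ast.Name` node such as `name=HF_GME_QWEN2VL_2B` only carries the identifier in `.id`, so `kw.value.value` raises `AttributeError`. The sketch uses explicit `isinstance` checks instead of the patch's `try/except`, purely for illustration; the assigned string is a hypothetical placeholder.

```python
import ast

# Hypothetical module snippet: one ModelMeta gets its name as a string
# literal, the other via a module-level variable.
source = """
HF_GME_QWEN2VL_2B = "gme-Qwen2-VL-2B"  # placeholder value, for illustration
m1 = ModelMeta(name="jinaai/jina-clip-v1")
m2 = ModelMeta(name=HF_GME_QWEN2VL_2B)
"""

for node in ast.walk(ast.parse(source)):
    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "ModelMeta"
    ):
        kw = next(k for k in node.keywords if k.arg == "name")
        if isinstance(kw.value, ast.Constant):
            # String literal: the text lives in .value
            print("literal:", kw.value.value)  # literal: jinaai/jina-clip-v1
        elif isinstance(kw.value, ast.Name):
            # Variable reference: only the identifier is available, in .id
            print("variable:", kw.value.id)  # variable: HF_GME_QWEN2VL_2B
```

Note that for the `ast.Name` case the script can only recover the variable's identifier, not the string it points to, which is why the fallback in the patch yields `kw.value.id` rather than resolving the assignment.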