embeddings-benchmark · isaac-chung · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025
diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py
@@ -404,6 +404,26 @@ def fetch_image(
 
 
 ###
+training_data = {
+    "MSMARCO": ["train"],
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translation not trained on
+    "HotpotQA": ["train"],
+    "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQAHardNegatives": ["train"],
+    # TriviaQA (Joshi et al., 2017),
+    # SQuAD (Rajpurkar et al., 2016),
+    "FEVER": ["train"],
+    # AllNLI for SimCSE (Gao et al., 2021), selecting a total of 1 million entries.
+    # ImageNet (Deng et al., 2009)
+    # LAION (Schuhmann et al., 2022),
+    # mscoco (Lin et al., 2014),
+    # Docmatix (Laurençon et al., 2024)
+    # synthetic data
+    # M-BEIR (Wei et al., 2024)
+}
 
 
 gme_qwen2vl_2b = ModelMeta(
@@ -416,6 +436,7 @@ def fetch_image(
     open_weights=True,
     revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
     release_date="2024-12-24",
+    modalities=["image", "text"],
     n_parameters=2_210_000_000,
     embed_dim=1536,
     license="apache-2.0",
@@ -426,7 +447,7 @@ def fetch_image(
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    training_datasets=training_data,
 )
 
 gme_qwen2vl_7b = ModelMeta(
@@ -439,6 +460,7 @@ def fetch_image(
     open_weights=True,
     revision="477027a6480f8630363be77751f169cc3434b673",
     release_date="2024-12-24",
+    modalities=["image", "text"],
     n_parameters=8_290_000_000,
     embed_dim=3584,
     license="apache-2.0",
@@ -449,5 +471,5 @@ def fetch_image(
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    training_datasets=training_data,
 )
diff --git a/mteb/models/jina_clip.py b/mteb/models/jina_clip.py
@@ -158,18 +158,25 @@ def encode(  # type: ignore
     revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4",
     release_date="2024-05-30",
     modalities=["image", "text"],
-    n_parameters=None,
-    max_tokens=None,
-    embed_dim=None,
-    license=None,
-    open_weights=None,
+    n_parameters=223_000_000,
+    max_tokens=8192,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
     public_training_code=None,
     public_training_data=None,
     framework=["PyTorch"],
-    reference=None,
+    reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=None,
-    use_instructions=None,
-    training_datasets=None,
+    use_instructions=True,
+    training_datasets={
+        # LAION400M
+        # ShareGPT4V
+        "MSMARCO": ["train"],
+        # NQ
+        # HotpotQA
+        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+    },
 )
 
 

diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py
@@ -48,14 +48,26 @@ def extract_model_names(
                             and isinstance(node.value.func, ast.Name)
                             and node.value.func.id == "ModelMeta"
                         ):
-                            model_name = next(
-                                (
-                                    kw.value.value
-                                    for kw in node.value.keywords
-                                    if kw.arg == "name"
-                                ),
-                                None,
-                            )
+                            try:
+                                model_name = next(
+                                    (
+                                        kw.value.value
+                                        for kw in node.value.keywords
+                                        if kw.arg == "name"
+                                    ),
+                                    None,
+                                )
+                            except AttributeError:
+                                # For cases where name is assigned a variable and not a direct string,
+                                # e.g. in gme_v_models.py: `name=HF_GME_QWEN2VL_2B`
+                                model_name = next(
+                                    (
+                                        kw.value.id
+                                        for kw in node.value.keywords
+                                        if kw.arg == "name"
+                                    ),
+                                    None,
+                                )
                             if model_name:
                                 model_names.append(model_name)
                                 first_model_found = True