Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 94 additions & 90 deletions mteb/models/siglip_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ def get_fused_embeddings(
return image_embeddings


# Shared `training_datasets` value passed to the SigLIP ModelMeta entries below.
# The mapping is deliberately left without entries; the only training source
# noted for these checkpoints is WebLI (https://arxiv.org/abs/2209.06794).
# NOTE(review): presumably WebLI is omitted because it has no matching
# benchmark dataset key -- confirm before adding entries.
siglip_training_datasets = {}

siglip_so400m_patch14_224 = ModelMeta(
loader=partial(
SiglipModelWrapper,
Expand All @@ -165,18 +169,18 @@ def get_fused_embeddings(
revision="d04cf29fca7b6374f74d8bea1969314492266b5e",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=877_000_000,
max_tokens=16,
embed_dim=1152,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-so400m-patch14-224",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_so400m_patch14_384 = ModelMeta(
Expand All @@ -189,18 +193,18 @@ def get_fused_embeddings(
revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=878_000_000,
max_tokens=64,
embed_dim=1152,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-so400m-patch14-384",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_so400m_patch16_256_i18n = ModelMeta(
Expand All @@ -213,18 +217,18 @@ def get_fused_embeddings(
revision="365d321c0cfdea96bc28e3a29787a11a062681a1",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=1_130_000_000,
max_tokens=64,
embed_dim=1152,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-so400m-patch16-256-i18n",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_base_patch16_256_multilingual = ModelMeta(
Expand All @@ -237,18 +241,18 @@ def get_fused_embeddings(
revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=371_000_000,
max_tokens=64,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-base-patch16-256-multilingual",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_base_patch16_256 = ModelMeta(
Expand All @@ -261,18 +265,18 @@ def get_fused_embeddings(
revision="b078df89e446d623010d890864d4207fe6399f61",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=203_000_000,
max_tokens=64,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-base-patch16-256",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_base_patch16_512 = ModelMeta(
Expand All @@ -285,18 +289,18 @@ def get_fused_embeddings(
revision="753a949581523b60257d93e18391e8c27f72eb22",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=204_000_000,
max_tokens=64,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-base-patch16-512",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_base_patch16_384 = ModelMeta(
Expand All @@ -309,18 +313,18 @@ def get_fused_embeddings(
revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=203_000_000,
max_tokens=64,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-base-patch16-384",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_base_patch16_224 = ModelMeta(
Expand All @@ -333,18 +337,18 @@ def get_fused_embeddings(
revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=203_000_000,
max_tokens=64,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-base-patch16-224",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_large_patch16_256 = ModelMeta(
Expand All @@ -357,18 +361,18 @@ def get_fused_embeddings(
revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=652_000_000,
max_tokens=64,
embed_dim=1024,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-large-patch16-256",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

siglip_large_patch16_384 = ModelMeta(
Expand All @@ -381,18 +385,18 @@ def get_fused_embeddings(
revision="ce005573a40965dfd21fd937fbdeeebf2439fc35",
release_date="2024-01-08",
modalities=["image", "text"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
n_parameters=652_000_000,
max_tokens=64,
embed_dim=1024,
license="apache-2.0",
open_weights=True,
public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py",
public_training_data=None,
framework=["PyTorch"],
reference=None,
reference="https://huggingface.co/google/siglip-large-patch16-384",
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
use_instructions=False,
training_datasets=siglip_training_datasets,
)

if __name__ == "__main__":
Expand Down