diff --git a/mteb/models/siglip_models.py b/mteb/models/siglip_models.py index 739b5aa59a..3bca2e508e 100644 --- a/mteb/models/siglip_models.py +++ b/mteb/models/siglip_models.py @@ -155,6 +155,10 @@ def get_fused_embeddings( return image_embeddings +siglip_training_datasets = { + # WebLI https://arxiv.org/abs/2209.06794 +} + siglip_so400m_patch14_224 = ModelMeta( loader=partial( SiglipModelWrapper, @@ -165,18 +169,18 @@ def get_fused_embeddings( revision="d04cf29fca7b6374f74d8bea1969314492266b5e", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=877_000_000, + max_tokens=16, + embed_dim=1152, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-so400m-patch14-224", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_so400m_patch14_384 = ModelMeta( @@ -189,18 +193,18 @@ def get_fused_embeddings( revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=878_000_000, + max_tokens=64, + embed_dim=1152, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-so400m-patch14-384", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_so400m_patch16_256_i18n = ModelMeta( @@ -213,18 +217,18 @@ def get_fused_embeddings( revision="365d321c0cfdea96bc28e3a29787a11a062681a1", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=1_130_000_000, + max_tokens=64, + embed_dim=1152, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-so400m-patch16-256-i18n", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_base_patch16_256_multilingual = ModelMeta( @@ -237,18 +241,18 @@ def get_fused_embeddings( revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=371_000_000, + max_tokens=64, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-base-patch16-256-multilingual", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_base_patch16_256 = ModelMeta( @@ -261,18 +265,18 @@ def get_fused_embeddings( revision="b078df89e446d623010d890864d4207fe6399f61", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=203_000_000, + max_tokens=64, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-base-patch16-256", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_base_patch16_512 = ModelMeta( @@ -285,18 +289,18 @@ def get_fused_embeddings( revision="753a949581523b60257d93e18391e8c27f72eb22", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=204_000_000, + max_tokens=64, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-base-patch16-512", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_base_patch16_384 = ModelMeta( @@ -309,18 +313,18 @@ def get_fused_embeddings( revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=203_000_000, + max_tokens=64, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-base-patch16-384", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_base_patch16_224 = ModelMeta( @@ -333,18 +337,18 @@ def get_fused_embeddings( revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=203_000_000, + max_tokens=64, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-base-patch16-224", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_large_patch16_256 = ModelMeta( @@ -357,18 +361,18 @@ def get_fused_embeddings( revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=652_000_000, + max_tokens=64, + embed_dim=1024, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-large-patch16-256", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) siglip_large_patch16_384 = ModelMeta( @@ -381,18 +385,18 @@ def get_fused_embeddings( revision="ce005573a40965dfd21fd937fbdeeebf2439fc35", release_date="2024-01-08", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + n_parameters=652_000_000, + max_tokens=64, + embed_dim=1024, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/google-research/big_vision/blob/main/big_vision/trainers/proj/image_text/siglip.py", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/google/siglip-large-patch16-384", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=siglip_training_datasets, ) if __name__ == "__main__":