Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/create_tasks_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import polars as pl

import mteb
from mteb.abstasks.TaskMetadata import PROGRAMMING_LANGS, TASK_TYPE
from mteb.languages import ISO_TO_FAM_LEVEL0, ISO_TO_LANGUAGE
from mteb.abstasks.TaskMetadata import TASK_TYPE
from mteb.languages import ISO_TO_FAM_LEVEL0, ISO_TO_LANGUAGE, PROGRAMMING_LANGS


def author_from_bibtex(bibtex: str | None) -> str:
Expand Down
50 changes: 3 additions & 47 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@
from ..encoder_interface import PromptType
from ..languages import (
ISO_LANGUAGE_SCRIPT,
ISO_TO_LANGUAGE,
ISO_TO_SCRIPT,
path_to_lang_codes,
path_to_lang_scripts,
check_language_code,
)

TASK_SUBTYPE = Literal[
Expand Down Expand Up @@ -160,23 +157,6 @@
list[ISO_LANGUAGE_SCRIPT], Mapping[HFSubset, list[ISO_LANGUAGE_SCRIPT]]
]

PROGRAMMING_LANGS = [
"python",
"javascript",
"typescript",
"go",
"ruby",
"java",
"php",
"c",
"c++",
"rust",
"swift",
"scala",
"shell",
"sql",
]

METRIC_NAME = str
METRIC_VALUE = Union[int, float, dict[str, Any]]

Expand Down Expand Up @@ -320,34 +300,10 @@ def eval_langs_are_valid(self, eval_langs: LANGUAGES) -> None:
if isinstance(eval_langs, dict):
for langs in eval_langs.values():
for code in langs:
self._check_language_code(code)
check_language_code(code)
else:
for code in eval_langs:
self._check_language_code(code)

@staticmethod
def _check_language_code(code):
"""Validate a BCP-47-style language code of the form "<lang>-<script>".

The language part must be an ISO 639-3 code and the script part an
ISO 15924 code. As a special case, a script of "Code" marks a
programming language, whose language part must be listed in
PROGRAMMING_LANGS.

Raises:
    ValueError: If the code is malformed or either part is unknown.
"""
if "-" not in code:
raise ValueError(
f"Language code should be specified as a BCP-47 language tag (e.g. 'eng-Latn'). Got: {code}"
)
# NOTE: a code with more than one "-" still raises ValueError here,
# but via tuple unpacking rather than the explicit message above.
lang, script = code.split("-")
# "Code" is a sentinel script marking programming languages (e.g. "python-Code").
if script == "Code":
if lang in PROGRAMMING_LANGS:
return  # override for code
else:
raise ValueError(
f"Programming language {lang} is not a valid programming language."
)
if lang not in ISO_TO_LANGUAGE:
raise ValueError(
f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}"
)
if script not in ISO_TO_SCRIPT:
raise ValueError(
f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}"
)
check_language_code(code)

@property
def bcp47_codes(self) -> list[ISO_LANGUAGE_SCRIPT]:
Expand Down
38 changes: 38 additions & 0 deletions mteb/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,24 @@
path_to_lang_scripts = Path(__file__).parent / "iso_15924_to_script.json"
path_to_lang_fam = Path(__file__).parent / "language_family.json"

# Programming languages accepted as the language part of a code whose script
# part is the sentinel "Code" (e.g. "python-Code"); consulted by
# check_language_code() in this module.
PROGRAMMING_LANGS = [
"python",
"javascript",
"typescript",
"go",
"ruby",
"java",
"php",
"c",
"c++",
"c#",
"rust",
"swift",
"scala",
"shell",
"sql",
]

with path_to_lang_codes.open("r") as f:
ISO_TO_LANGUAGE = json.load(f)

Expand Down Expand Up @@ -98,3 +116,23 @@ def contains_scripts(self, scripts: Iterable[str]) -> bool:
if not self.contains_script(s):
return False
return True


def check_language_code(code: str) -> None:
    """Validate a language code of the form "<lang>-<script>" (e.g. "eng-Latn").

    The language part must be a valid ISO 639-3 code and the script part a
    valid ISO 15924 code. As a special case, a script of "Code" marks a
    programming language, in which case the language part must be one of
    PROGRAMMING_LANGS.

    Args:
        code: A BCP-47-style language tag such as "eng-Latn" or "python-Code".

    Raises:
        ValueError: If the code is not a two-part tag, or if either part is
            unknown.
    """
    parts = code.split("-")
    # Check the shape explicitly: without this, a tag like "engLatn" (or one
    # with two hyphens) would raise a bare unpacking ValueError ("not enough
    # values to unpack") instead of an actionable message.
    if len(parts) != 2:
        raise ValueError(
            f"Language code should be specified as a BCP-47 language tag (e.g. 'eng-Latn'). Got: {code}"
        )
    lang, script = parts
    # "Code" is a sentinel script marking programming languages.
    if script == "Code":
        if lang in PROGRAMMING_LANGS:
            return  # override for code
        raise ValueError(
            f"Programming language {lang} is not a valid programming language."
        )
    if lang not in ISO_TO_LANGUAGE:
        raise ValueError(
            f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}"
        )
    if script not in ISO_TO_SCRIPT:
        raise ValueError(
            f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}"
        )
15 changes: 14 additions & 1 deletion mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
from mteb.encoder_interface import Encoder

from .custom_validators import LICENSES, MODALITIES, STR_DATE, STR_URL
from .languages import ISO_LANGUAGE_SCRIPT
from .languages import (
ISO_LANGUAGE_SCRIPT,
check_language_code,
)

if TYPE_CHECKING:
from .models.sentence_transformer_wrapper import SentenceTransformerWrapper
Expand Down Expand Up @@ -123,6 +126,16 @@ def to_dict(self):
dict_repr["loader"] = get_loader_name(loader)
return dict_repr

@field_validator("languages")
@classmethod
def languages_are_valid(cls, languages: list[ISO_LANGUAGE_SCRIPT] | None) -> None:
if languages is None:
return None

for code in languages:
check_language_code(code)
return languages

@field_validator("name")
@classmethod
def check_name(cls, v: str | None) -> str | None:
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/align_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def get_fused_embeddings(
model_name="kakaobrain/align-base",
),
name="kakaobrain/align-base",
languages=["eng_Latn"],
languages=["eng-Latn"],
revision="e96a37facc7b1f59090ece82293226b817afd6ba",
release_date="2023-02-24",
modalities=["image", "text"],
Expand Down
160 changes: 80 additions & 80 deletions mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,80 +5,80 @@
from mteb.model_meta import ModelMeta, sentence_transformers_loader

# Languages supported by the snowflake-arctic-embed v2.0 models, as
# hyphenated BCP-47-style "lang-Script" tags (the form accepted by
# mteb.languages.check_language_code, which splits on "-"). The scraped diff
# interleaved the removed underscore-separated entries with the added
# hyphenated ones; only the canonical hyphenated set is kept here.
LANGUAGES_V2_0 = [
    "afr-Latn",
    "ara-Arab",
    "aze-Latn",
    "bel-Cyrl",
    "bul-Cyrl",
    "ben-Beng",
    "cat-Latn",
    "ceb-Latn",
    "ces-Latn",
    "cym-Latn",
    "dan-Latn",
    "deu-Latn",
    "ell-Grek",
    "eng-Latn",
    "spa-Latn",
    "est-Latn",
    "eus-Latn",
    "fas-Arab",
    "fin-Latn",
    "fra-Latn",
    "glg-Latn",
    "guj-Gujr",
    "heb-Hebr",
    "hin-Deva",
    "hrv-Latn",
    "hat-Latn",
    "hun-Latn",
    "hye-Armn",
    "ind-Latn",
    "isl-Latn",
    "ita-Latn",
    "jpn-Jpan",
    "jav-Latn",
    "kat-Geor",
    "kaz-Cyrl",
    "khm-Khmr",
    "kan-Knda",
    "kor-Hang",
    "kir-Cyrl",
    "lao-Laoo",
    "lit-Latn",
    "lav-Latn",
    "mkd-Cyrl",
    "mal-Mlym",
    "mon-Cyrl",
    "mar-Deva",
    "msa-Latn",
    "mya-Mymr",
    "nep-Deva",
    "nld-Latn",
    "pan-Guru",
    "pol-Latn",
    "por-Latn",
    "que-Latn",
    "ron-Latn",
    "rus-Cyrl",
    "sin-Sinh",
    "slk-Latn",
    "slv-Latn",
    "som-Latn",
    "sqi-Latn",
    "srp-Cyrl",
    "swe-Latn",
    "swa-Latn",
    "tam-Taml",
    "tel-Telu",
    "tha-Thai",
    "tgl-Latn",
    "tur-Latn",
    "ukr-Cyrl",
    "urd-Arab",
    "vie-Latn",
    "yor-Latn",
    "zho-Hans",
]

arctic_v1_training_datasets = {
Expand Down Expand Up @@ -126,7 +126,7 @@
name="Snowflake/snowflake-arctic-embed-xs",
revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e",
release_date="2024-07-08", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=22_600_000,
Expand Down Expand Up @@ -154,7 +154,7 @@
name="Snowflake/snowflake-arctic-embed-s",
revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f",
release_date="2024-04-12", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=32_200_000,
Expand Down Expand Up @@ -182,7 +182,7 @@
name="Snowflake/snowflake-arctic-embed-m",
revision="cc17beacbac32366782584c8752220405a0f3f40",
release_date="2024-04-12", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
Expand Down Expand Up @@ -210,7 +210,7 @@
name="Snowflake/snowflake-arctic-embed-m-long",
revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1",
release_date="2024-04-12", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=137_000_000,
Expand All @@ -237,7 +237,7 @@
name="Snowflake/snowflake-arctic-embed-l",
revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c",
release_date="2024-04-12", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=335_000_000,
Expand Down Expand Up @@ -267,7 +267,7 @@
name="Snowflake/snowflake-arctic-embed-m-v1.5",
revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
release_date="2024-07-08", # initial commit of hf model.
languages=["eng_Latn"],
languages=["eng-Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
Expand Down
Loading