diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index b77887e4d1..c283457273 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -328,6 +328,10 @@ def eval_langs_are_valid(self, eval_langs: LANGUAGES) -> None:
     @staticmethod
     def _check_language_code(code):
         """This method checks that the language code (e.g. "eng-Latn") is valid."""
+        if "-" not in code:
+            raise ValueError(
+                f"Language code should be specified as a BCP-47 language tag (e.g. 'eng-Latn'). Got: {code}"
+            )
         lang, script = code.split("-")
         if script == "Code":
             if lang in PROGRAMMING_LANGS:
diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py
index 241ec536ea..75f3095a48 100644
--- a/tests/test_TaskMetadata.py
+++ b/tests/test_TaskMetadata.py
@@ -5,6 +5,7 @@
 import pytest
 
 from mteb import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.overview import get_tasks
 
@@ -179,10 +180,23 @@
     "TamilNewsClassification",
     "TenKGnadClusteringP2P.v2",
     "TenKGnadClusteringS2S.v2",
-    "SynPerChatbotConvSAClassification",
-    "CQADupstackRetrieval-Fa",
-    "VisualSTS17Eng",
-    "VisualSTS17Multilingual",
+    "ClimateFEVERHardNegatives",
+    "DBPediaHardNegatives",
+    "FEVERHardNegatives",
+    "HotpotQAHardNegatives",
+    "MSMARCOHardNegatives",
+    "NQHardNegatives",
+    "QuoraRetrievalHardNegatives",
+    "TopiOCQAHardNegatives",
+    "MIRACLRetrievalHardNegatives",
+    "NeuCLIR2022RetrievalHardNegatives",
+    "NeuCLIR2023RetrievalHardNegatives",
+    "DBPedia-PLHardNegatives",
+    "HotpotQA-PLHardNegatives",
+    "MSMARCO-PLHardNegatives",
+    "NQ-PLHardNegatives",
+    "Quora-PLHardNegatives",
+    "RiaNewsRetrievalHardNegatives",
 ]
 
 
@@ -357,23 +371,54 @@ def test_filled_metadata_is_filled():
     )
 
 
+def test_invalid_metadata_eval_lang_is_invalid():
+    with pytest.raises(ValueError):
+        TaskMetadata(
+            name="MyTask",
+            dataset={
+                "path": "test/dataset",
+                "revision": "1.0",
+            },
+            description="testing",
+            reference="https://aclanthology.org/W19-6138/",
+            type="Classification",
+            category="s2s",
+            modalities=["text"],
+            eval_splits=["test"],
+            eval_langs=["eng_Latn"],  # uses underscore instead of dash
+            main_score="map",
+            date=("2021-01-01", "2021-12-31"),
+            domains=["Non-fiction", "Written"],
+            license="mit",
+            task_subtypes=["Thematic clustering"],
+            annotations_creators="expert-annotated",
+            dialect=[],
+            sample_creation="found",
+            bibtex_citation="Someone et al",
+        ).validate_metadata()
+
+
 def test_all_metadata_is_filled_and_valid():
     all_tasks = get_tasks()
     unfilled_metadata = []
+    invalid_metadata = []
     for task in all_tasks:
-        if (
-            task.metadata.name not in _HISTORIC_DATASETS
-            and task.metadata.name.replace("HardNegatives", "")
-            not in _HISTORIC_DATASETS
+        if task.metadata.name in _HISTORIC_DATASETS or isinstance(
+            task, AbsTaskAggregate
         ):
-            if not task.metadata.is_filled() and (
-                not task.metadata.validate_metadata()
-            ):
-                unfilled_metadata.append(task.metadata.name)
-    if unfilled_metadata:
+            continue
+
+        if not task.metadata.is_filled():
+            unfilled_metadata.append(task.metadata.name)
+        else:
+            if task.metadata.validate_metadata() is not None:
+                invalid_metadata.append(task.metadata.name)
+
+    if unfilled_metadata or invalid_metadata:
         raise ValueError(
-            f"The metadata of the following datasets is not filled: {unfilled_metadata}"
+            f"The metadata of the following datasets is not filled: {unfilled_metadata}. "
+            + f"The metadata of the following datasets is invalid: {invalid_metadata}."
         )
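
A minimal sketch of the behavior the first hunk adds, assuming the patch above is applied and mteb is importable; "eng_Latn" is a deliberately malformed tag used only for illustration:

    # Hypothetical usage of the new guard in TaskMetadata._check_language_code.
    # Before the patch, "eng_Latn".split("-") failed with an opaque unpacking
    # ValueError; with the guard, the error names the expected BCP-47 format.
    from mteb.abstasks.TaskMetadata import TaskMetadata

    TaskMetadata._check_language_code("eng-Latn")  # a well-formed tag passes the new guard

    try:
        TaskMetadata._check_language_code("eng_Latn")  # underscore instead of dash
    except ValueError as err:
        print(err)
        # Language code should be specified as a BCP-47 language tag (e.g. 'eng-Latn'). Got: eng_Latn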