diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py index bf8c9e505..904884aca 100644 --- a/src/lighteval/metrics/dynamic_metrics.py +++ b/src/lighteval/metrics/dynamic_metrics.py @@ -119,7 +119,7 @@ def multilingual_quasi_f1_score_metric( Returns: F1 score metric. """ - metric_name = f"f1_{language}" + metric_name = f"f1_{language.value}" multilang_normalizer = get_multilingual_normalizer(language) return SampleLevelMetric( @@ -153,7 +153,7 @@ def multilingual_quasi_exact_match_metric( Returns: Exact match metric. """ - metric_name = f"exact_match_{language}_{match_type}" + metric_name = f"exact_match_{language.value}_{match_type}" multilang_normalizer = get_multilingual_normalizer(language) return SampleLevelMetric( metric_name=metric_name, diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index 14af524e6..daf213a06 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -23,20 +23,27 @@ from langcodes import Language as LangCodeLanguage from langcodes import standardize_tag -from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric +from lighteval.metrics.dynamic_metrics import ( + loglikelihood_acc_metric, + multilingual_quasi_exact_match_metric, + multilingual_quasi_f1_score_metric, +) from lighteval.metrics.normalizations import LogProbTokenNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.templates.copa import get_copa_prompt_function from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.qa import get_qa_prompt_function from lighteval.tasks.templates.utils.formulation import ( CFFormulation, HybridFormulation, MCFFormulation, ) -from lighteval.utils.language import Language +from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro +TASKS_TABLE = [] # ------------------------------- NLI Tasks ------------------------------- # # NLI (Natural Language Inference) tasks involve determining the logical relationship # between two given sentences: a premise and a hypothesis. The goal is to classify @@ -322,6 +329,9 @@ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] ] +TASKS_TABLE.extend( + [*xnli_tasks, *xnli2_tasks, *xnli_indic_tasks, *paws_x_tasks, *rcb_tasks, *ocnli_tasks, *cmnli_tasks] +) # ------------------------------- Copa Tasks ------------------------------- # # COPA (Choice of Plausible Alternatives) tasks involve determining the most plausible cause or effect # for a given premise. These tasks test common sense reasoning and causal inference abilities. 
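Context for the dynamic_metrics.py hunks above: Language is a plain Enum, so interpolating the member itself bakes the repr-style member name into the metric name, while .value yields the language code. A minimal sketch of the difference, not part of the diff; the "eng" value below is an assumption meant to mirror the ISO 639-3-style codes in lighteval.utils.language.

    # Sketch only: a plain Enum formats as "ClassName.MEMBER", so the old
    # f-string produced names like "f1_Language.ENGLISH" instead of "f1_eng".
    from enum import Enum

    class Language(Enum):
        ENGLISH = "eng"  # assumed value, mirroring the ISO 639-3 codes in language.py

    print(f"f1_{Language.ENGLISH}")        # f1_Language.ENGLISH  (old metric name)
    print(f"f1_{Language.ENGLISH.value}")  # f1_eng               (new metric name)
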
@@ -343,8 +353,8 @@ }, formulation=formulation, ), - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa", - hf_subset="copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value), + hf_repo=("OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa"), + hf_subset=("copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value)), evaluation_splits=["test"], few_shots_split="validation", generation_size=-1, @@ -447,6 +457,7 @@ ] +TASKS_TABLE.extend([*xcopa_tasks, *copa_indic_tasks, *parus_tasks]) # ------------------------------- Hellaswag Tasks ------------------------------- # # Hellaswag is a commonsense reasoning task that requires models to complete a given scenario # with the most plausible ending. It tests the model's ability to understand and reason about @@ -458,7 +469,7 @@ # It evaluates commonsense reasoning abilities across multiple languages. mlmm_hellaswag_tasks = [ LightevalTaskConfig( - name=f"hellaswag_{lang.value}_{formulation.name.lower()}", + name=f"mlmm_hellaswag_{lang.value}_{formulation.name.lower()}", suite=["lighteval"], prompt_function=get_hellaswag_prompt_function( language=lang, @@ -571,7 +582,7 @@ }, formulation=formulation, ), - hf_repo="HuggingFaceFW-Dev/hellaswag_thai", + hf_repo="lighteval/hellaswag_thai", hf_subset="default", evaluation_splits=["validation"], few_shots_split="train", @@ -582,18 +593,525 @@ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] ] -TASKS_TABLE = [ - *xnli_tasks, - *xnli2_tasks, - *xnli_indic_tasks, - *paws_x_tasks, - *rcb_tasks, - *ocnli_tasks, - *cmnli_tasks, - *xcopa_tasks, - *copa_indic_tasks, - *parus_tasks, - *mlmm_hellaswag_tasks, - *hellaswag_tur_tasks, - *hellaswag_tha_tasks, +TASKS_TABLE.extend( + [ + *mlmm_hellaswag_tasks, + *hellaswag_tur_tasks, + *hellaswag_tha_tasks, + ] +) +# ------------------------------- RC Tasks ------------------------------- # +# Reading Comprehension (RC) tasks evaluate a model's ability to understand and extract information from text passages. +# These tasks typically involve answering questions based on given contexts, spanning multiple languages and formats. +# Add RC tasks supporting about 130 unique languages/scripts. + +# SQuAD - like + +# XQuAD: Cross-lingual Question Answering Dataset, extending SQuAD to 11 languages. +# https://arxiv.org/abs/1910.11856 +xquad_tasks = [ + LightevalTaskConfig( + name=f"xquad_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="google/xquad", + hf_subset=f"xquad.{standardize_tag(language.value)}", + evaluation_splits=("validation",), + few_shots_split="validation", + generation_size=400, + stop_sequence=("\n",), + metric=( + multilingual_quasi_exact_match_metric(language, "prefix"), + multilingual_quasi_f1_score_metric(language), + ), + ) + for language in [ + Language.ARABIC, + Language.GERMAN, + Language.GREEK, + Language.ENGLISH, + Language.SPANISH, + Language.HINDI, + Language.ROMANIAN, + Language.RUSSIAN, + Language.THAI, + Language.TURKISH, + Language.VIETNAMESE, + Language.CHINESE, + ] +] + +# ThaiQA: A question answering dataset for the Thai language. 
+thaiqa_tasks = [ + LightevalTaskConfig( + name=f"thaiqa_{Language.THAI.value}", + prompt_function=get_qa_prompt_function( + Language.THAI, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/thaiqa_squad_fixed", + hf_subset="default", + evaluation_splits=("train",), + few_shots_split="validation", + generation_size=400, + stop_sequence=("\n",), + metric=( + multilingual_quasi_exact_match_metric(Language.THAI, "prefix"), + multilingual_quasi_f1_score_metric(Language.THAI), + ), + ) +] + +# SberQuAD: A large-scale Russian reading comprehension dataset. +# https://arxiv.org/abs/1912.09723 +sber_squad_tasks = [ + LightevalTaskConfig( + name=f"sber_squad_{Language.RUSSIAN.value}", + prompt_function=get_qa_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="kuznetsoffandrey/sberquad", + hf_subset="sberquad", + evaluation_splits=("validation",), + few_shots_split="train", + metric=( + multilingual_quasi_exact_match_metric(Language.RUSSIAN, "prefix"), + multilingual_quasi_f1_score_metric(Language.RUSSIAN), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] + +# ARCD: Arabic Reading Comprehension Dataset. +# https://arxiv.org/pdf/1906.05394 +arcd_tasks = [ + LightevalTaskConfig( + name=f"arcd_{Language.ARABIC.value}", + prompt_function=get_qa_prompt_function( + Language.ARABIC, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="hsseinmz/arcd", + hf_subset="plain_text", + evaluation_splits=("train", "validation"), + metric=( + multilingual_quasi_exact_match_metric(Language.ARABIC, "prefix"), + multilingual_quasi_f1_score_metric(Language.ARABIC), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] + +# KenSwQuAD: A question answering dataset for Kenyan Swahili. +# https://arxiv.org/abs/2205.02364 +kenswquad_tasks = [ + LightevalTaskConfig( + name=f"kenswquad_{Language.SWAHILI.value}", + prompt_function=get_qa_prompt_function( + Language.SWAHILI, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [line["answer"]], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/KenSwQuAD", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="validation", + metric=( + multilingual_quasi_exact_match_metric(Language.SWAHILI, "prefix"), + multilingual_quasi_f1_score_metric(Language.SWAHILI), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] + +# ChineseSquad: A reading comprehension dataset for Chinese. 
+# https://github.com/pluto-junzeng/ChineseSquad +chinese_squad_tasks = [ + LightevalTaskConfig( + name=f"chinese_squad_{Language.CHINESE.value}", + prompt_function=get_qa_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/ChineseSquad", + hf_subset="default", + evaluation_splits=("validation",), + few_shots_split="train", + metric=( + multilingual_quasi_exact_match_metric(Language.CHINESE, "prefix"), + multilingual_quasi_f1_score_metric(Language.CHINESE), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] + +# CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese. +# https://arxiv.org/abs/1810.07366 +cmrc2018_tasks = [ + LightevalTaskConfig( + name=f"cmrc2018_{Language.CHINESE.value}", + prompt_function=get_qa_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="clue/clue", + hf_subset="cmrc2018", + evaluation_splits=("trial",), + few_shots_split="train", + generation_size=400, + metric=( + multilingual_quasi_exact_match_metric(Language.CHINESE, "prefix"), + multilingual_quasi_f1_score_metric(Language.CHINESE), + ), + stop_sequence=("\n",), + ) +] + +# IndicQA: A reading comprehension dataset for 11 Indian languages. +# https://arxiv.org/abs/2407.13522 +indicqa_tasks = [ + LightevalTaskConfig( + name=f"indicqa_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="ai4bharat/IndicQA", + hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}", + # Since we use trust_dataset, we have to be careful about what is inside the dataset + # script. We thus lock the revision to ensure that the script doesn't change + hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + trust_dataset=True, + evaluation_splits=("test",), + few_shots_split="test", + generation_size=400, + metric=( + multilingual_quasi_exact_match_metric(language, "prefix"), + multilingual_quasi_f1_score_metric(language), + ), + stop_sequence=("\n",), + ) + for language in [ + Language.ASSAMESE, + Language.BENGALI, + Language.GUJARATI, + Language.HINDI, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.ORIYA, + Language.PUNJABI, + Language.TAMIL, + Language.TELUGU, + ] +] + +# FQuAD v2: French Question Answering Dataset version 2. 
+# https://arxiv.org/abs/2002.06071 +fquad_v2_tasks = [ + LightevalTaskConfig( + name=f"fquadv2_{Language.FRENCH.value}", + prompt_function=get_qa_prompt_function( + Language.FRENCH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="manu/fquad2_test", + hf_subset="default", + evaluation_splits=("test_hasAns",), + few_shots_split="valid_hasAns", + generation_size=400, + stop_sequence=("\n",), + metric=( + multilingual_quasi_exact_match_metric(Language.FRENCH, "prefix"), + multilingual_quasi_f1_score_metric(Language.FRENCH), + ), + ) ] + +# TQuAD v2: Turkish Question Answering Dataset version 2. +tquad_v2_tasks = [ + LightevalTaskConfig( + name=f"tquadv2_{Language.TURKISH.value}", + prompt_function=get_qa_prompt_function( + Language.TURKISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [a["text"] for a in line["answers"]], + }, + ), + suite=("lighteval",), + hf_repo="erdometo/tquad2", + hf_subset="default", + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metric=( + multilingual_quasi_exact_match_metric(Language.TURKISH, "prefix"), + multilingual_quasi_f1_score_metric(Language.TURKISH), + ), + ) +] + +# Other QA tasks for RC + +# TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. +# https://arxiv.org/abs/2003.05002 +tydiqa_tasks = [ + LightevalTaskConfig( + name=f"tydiqa_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="google-research-datasets/tydiqa", + hf_subset="secondary_task", + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metric=( + multilingual_quasi_exact_match_metric(language, "prefix"), + multilingual_quasi_f1_score_metric(language), + ), + ) + for language in [ + Language.ENGLISH, + Language.ARABIC, + Language.BENGALI, + Language.FINNISH, + Language.INDONESIAN, + Language.JAPANESE, + Language.KOREAN, + Language.SWAHILI, + Language.RUSSIAN, + Language.TELUGU, + Language.THAI, + ] +] + +# Other MCF tasks for RC + +# Belebele: A large-scale reading comprehension dataset covering 122 languages. 
+# https://arxiv.org/abs/2308.16884 +belebele_tasks = [ + LightevalTaskConfig( + name=f"belebele_{language}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(language).to_alpha3()], + lambda line: { + "question": line["question"], + "context": line["flores_passage"], + "choices": [line[f"mc_answer{i}"] for i in range(1, 5)], + "gold_idx": int(line["correct_answer_num"]) - 1, + }, + ), + suite=("lighteval",), + hf_repo="facebook/belebele", + hf_subset=language, + evaluation_splits=("test",), + metric=[ + loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + ], + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] + for language in [ + "acm_Arab", + "arz_Arab", + "ceb_Latn", + "fin_Latn", + "hin_Deva", + "ita_Latn", + "khm_Khmr", + "lvs_Latn", + "npi_Deva", + "pol_Latn", + "slv_Latn", + "swe_Latn", + # "tso_Latn", + # "xho_Latn", + "afr_Latn", + "asm_Beng", + "ces_Latn", + "fra_Latn", + "hin_Latn", + "jav_Latn", + # "kin_Latn", + "mal_Mlym", + "npi_Latn", + "por_Latn", + # "sna_Latn", + "swh_Latn", + "tur_Latn", + "yor_Latn", + "als_Latn", + "azj_Latn", + "ckb_Arab", + # "fuv_Latn", + "hrv_Latn", + "jpn_Jpan", + "kir_Cyrl", + "mar_Deva", + # "nso_Latn", + "snd_Arab", + "tam_Taml", + "ukr_Cyrl", + "zho_Hans", + "amh_Ethi", + # "bam_Latn", + "dan_Latn", + # "gaz_Latn", + "hun_Latn", + # "kac_Latn", + "kor_Hang", + "mkd_Cyrl", + # "nya_Latn", + "ron_Latn", + "som_Latn", + "tel_Telu", + "urd_Arab", + "zho_Hant", + "apc_Arab", + "ben_Beng", + "deu_Latn", + # "grn_Latn", + "hye_Armn", + "kan_Knda", + "lao_Laoo", + "mlt_Latn", + "ory_Orya", + "rus_Cyrl", + # "sot_Latn", + "tgk_Cyrl", + "urd_Latn", + "zsm_Latn", + "arb_Arab", + "ben_Latn", + "ell_Grek", + "guj_Gujr", + # "ibo_Latn", + "kat_Geor", + # "lin_Latn", + # "mri_Latn", + "pan_Guru", + # "shn_Mymr", + "spa_Latn", + "tgl_Latn", + "uzn_Latn", + # "zul_Latn", + "arb_Latn", + # "bod_Tibt", + "eng_Latn", + # "hat_Latn", + # "ilo_Latn", + "kaz_Cyrl", + "lit_Latn", + "mya_Mymr", + "pbt_Arab", + "sin_Latn", + "srp_Cyrl", + "tha_Thai", + "vie_Latn", + "ars_Arab", + "bul_Cyrl", + "est_Latn", + # "hau_Latn", + "ind_Latn", + # "kea_Latn", + # "lug_Latn", + "nld_Latn", + "pes_Arab", + "sin_Sinh", + # "ssw_Latn", + # "tir_Ethi", + "war_Latn", + "ary_Arab", + "cat_Latn", + "eus_Latn", + "heb_Hebr", + "isl_Latn", + # "khk_Cyrl", + # "luo_Latn", + "nob_Latn", + "plt_Latn", + "slk_Latn", + # "sun_Latn", + # "tsn_Latn", + # "wol_Latn", + ] +] + +TASKS_TABLE.extend( + [ + *xquad_tasks, + *thaiqa_tasks, + *sber_squad_tasks, + *arcd_tasks, + *kenswquad_tasks, + *chinese_squad_tasks, + *cmrc2018_tasks, + *indicqa_tasks, + *fquad_v2_tasks, + *tquad_v2_tasks, + *tydiqa_tasks, + *belebele_tasks, + ] +) diff --git a/src/lighteval/tasks/templates/qa.py b/src/lighteval/tasks/templates/qa.py new file mode 100644 index 000000000..ca2b5c925 --- /dev/null +++ b/src/lighteval/tasks/templates/qa.py @@ -0,0 +1,78 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this 
permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from typing import Callable + +from typing_extensions import NotRequired, TypedDict + +from lighteval.tasks.templates.multichoice import MCQInput, create_adapter_from_dict, get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation +from lighteval.utils.language import Language + + +class QAInput(TypedDict): + question: str + choices: list[str] + context: NotRequired[str] + instruction: NotRequired[str] + + +class QAAdapter(TypedDict): + question: str + context: str + context: NotRequired[str] + instruction: NotRequired[str] + + +def get_qa_prompt_function(language: Language, adapter: Callable[[dict], QAInput] | QAAdapter): + """ + Create a templated prompt function for a QA task. + Example tasks: + - XQuAD + - SQuAD + + Format: + Question: xxx + Answer: | Answer + + Args: + language (Language): The language of the QA task. + adapter (Callable[[dict], QAInput] | QAAdapter): A function or dictionary to adapt the input data to the required QAInput format. + Must map data from the dataset row to the QAInput format. + + Returns: + Callable: A function that generates QA prompts based on the given parameters. + """ + + adapter_fn: Callable[[dict], QAInput] = ( + create_adapter_from_dict(adapter) if isinstance(adapter, dict) else adapter # type: ignore + ) + + def adapter_for_mcq(line: dict) -> MCQInput: + input_data = adapter_fn(line) + return { + **input_data, + "gold_idx": list(range(len(input_data["choices"]))), + } + + multichoice_prompt_fn = get_mcq_prompt_function(language, adapter=adapter_for_mcq, formulation=CFFormulation()) + return multichoice_prompt_fn diff --git a/src/lighteval/utils/language.py b/src/lighteval/utils/language.py index 68cf9340c..9474f419e 100644 --- a/src/lighteval/utils/language.py +++ b/src/lighteval/utils/language.py @@ -118,3 +118,137 @@ class Language(Enum): SORANI = "ckb" CEBUANO = "ceb" WAR = "war" + + +# This mapping was created for beleble, it converts iso_639_3 individual codes to iso_639_3 macro codes +# However it requires iso639-lang package and I don't see a point installing it just for this mapping +# Code to generate: +# ```` +# from langcodes import Language +# from iso639 import Lang + +# dst = get_dataset_config_names("facebook/belebele") +# output = {} +# for i in dst: +# lang_old = Lang(Language.get(i).language) +# lang = lang_old.macro() if lang_old.macro() else lang_old +# output[lang_old.pt3] = lang.pt3 +# ``` + +iso_639_3_ind_to_iso_639_3_macro = { + "acm": Language.ARABIC, + "arz": Language.ARABIC, + "ceb": Language.CEBUANO, + "fin": Language.FINNISH, + "hin": Language.HINDI, + "ita": Language.ITALIAN, + "khm": Language.KHMER, + "lvs": Language.LATVIAN, + "npi": Language.NEPALI, + "pol": Language.POLISH, + "slv": Language.SLOVENIAN, + "swe": Language.SWEDISH, + # 'tso': Language.TSONGA, + # 'xho': Language.XHOSA, + "afr": Language.AFRIKAANS, + "asm": Language.ASSAMESE, + "ces": Language.CZECH, + "fra": Language.FRENCH, + "jav": 
Language.JAVANESE, + # 'kin': Language.KINYARWANDA, + "mal": Language.MALAYALAM, + "por": Language.PORTUGUESE, + # 'sna': Language.SHONA, + "swh": Language.SWAHILI, + "tur": Language.TURKISH, + "yor": Language.YORUBA, + "als": Language.ALBANIAN, + "azj": Language.AZERBAIJANI, + "ckb": Language.KURDISH, + # 'fuv': Language.FULAH, + "hrv": Language.CROATIAN, + "jpn": Language.JAPANESE, + "kir": Language.KIRGHIZ, + "mar": Language.MARATHI, + # 'nso': Language.NORTHERN_SOTHO, + "snd": Language.SINDHI, + "tam": Language.TAMIL, + "ukr": Language.UKRAINIAN, + "zho": Language.CHINESE, + "amh": Language.AMHARIC, + # 'bam': Language.BAMBARA, + "dan": Language.DANISH, + # 'gaz': Language.OROMO, + "hun": Language.HUNGARIAN, + # 'kac': Language.KACHIN, + "kor": Language.KOREAN, + "mkd": Language.MACEDONIAN, + # 'nya': Language.CHICHEWA, + "ron": Language.ROMANIAN, + "som": Language.SOMALI, + "tel": Language.TELUGU, + "urd": Language.URDU, + "apc": Language.ARABIC, + "ben": Language.BENGALI, + "deu": Language.GERMAN, + # 'grn': Language.GUARANI, + "hye": Language.ARMENIAN, + "kan": Language.KANNADA, + "lao": Language.LAO, + "mlt": Language.MALTESE, + "ory": Language.ORIYA, + "rus": Language.RUSSIAN, + # 'sot': Language.SOUTHERN_SOTHO, + "tgk": Language.TAJIK, + "zsm": Language.MALAY, + "arb": Language.ARABIC, + "ell": Language.GREEK, + "guj": Language.GUJARATI, + # 'ibo': Language.IGBO, + "kat": Language.GEORGIAN, + # 'lin': Language.LINGALA, + # 'mri': Language.MAORI, + "pan": Language.PUNJABI, + # 'shn': Language.SHAN, + "spa": Language.SPANISH, + "fil": Language.TAGALOG, + "uzn": Language.UZBEK, + # 'zul': Language.ZULU, + # 'bod': Language.TIBETAN, + "eng": Language.ENGLISH, + # 'hat': Language.HAITIAN, + # 'ilo': Language.ILOCANO, + "kaz": Language.KAZAKH, + "lit": Language.LITHUANIAN, + "mya": Language.BURMESE, + "pbt": Language.PASHTO, + "sin": Language.SINHALA, + "srp": Language.SERBIAN, + "tha": Language.THAI, + "vie": Language.VIETNAMESE, + "ars": Language.ARABIC, + "bul": Language.BULGARIAN, + "est": Language.ESTONIAN, + # 'hau': Language.HAUSA, + "ind": Language.INDONESIAN, + # 'kea': Language.KABUVERDIANU, + # 'lug': Language.GANDA, + "nld": Language.DUTCH, + "pes": Language.PERSIAN, + # 'ssw': Language.SWATI, + # 'tir': Language.TIGRINYA, + "war": Language.WAR, + "ary": Language.ARABIC, + "cat": Language.CATALAN, + "eus": Language.BASQUE, + "heb": Language.HEBREW, + "isl": Language.ICELANDIC, + # 'khk': Language.MONGOLIAN, + # 'luo': Language.LUO, + "nob": Language.NORWEGIAN, + "plt": Language.MALAGASY, + "slk": Language.SLOVAK, + # 'sun': Language.SUNDANESE, + # 'tsn': Language.TSWANA, + # 'wol': Language.WOLOF +}
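
Usage sketch for the new pieces, not part of the diff: only get_qa_prompt_function, the Language enum, and iso_639_3_ind_to_iso_639_3_macro come from this change; the sample row, the task name, and the (line, task_name) call signature follow lighteval's existing prompt-function convention and are assumptions here.

    from langcodes import Language as LangCodeLanguage

    from lighteval.tasks.templates.qa import get_qa_prompt_function
    from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro

    # QA template: the adapter maps a SQuAD-style row to QAInput, and qa.py marks
    # every non-empty reference answer as gold, so the generative metrics accept
    # any of them.
    prompt_fn = get_qa_prompt_function(
        Language.ENGLISH,
        lambda line: {
            "question": line["question"],
            "context": line["context"],
            "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
        },
    )
    doc = prompt_fn(
        {
            "question": "Where is the Eiffel Tower?",
            "context": "The Eiffel Tower is a landmark in Paris, France.",
            "answers": {"text": ["Paris", "Paris, France"]},
        },
        "qa_sketch",  # hypothetical task name, for illustration only
    )
    print(doc.query)  # rendered "Question: ... / Answer" prompt in the CF formulation

    # Belebele subsets are ISO 639-3 individual codes plus a script tag; the new
    # mapping collapses them to macro-language members of the Language enum,
    # exactly as tasks.py does for the belebele_tasks configs.
    subset = "arz_Arab"
    lang = iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(subset).to_alpha3()]
    assert lang == Language.ARABIC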