From 060c3ae1f37e8e2b98b2e54d9e1fe2e122fd5cba Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Thu, 10 Oct 2024 01:01:39 +0200 Subject: [PATCH 1/6] add translaiton literals --- src/lighteval/tasks/multilingual/tasks.py | 3 +- src/lighteval/tasks/templates/nli.py | 2 + .../templates/utils/translation_literals.py | 575 ++++++++++++++++-- src/lighteval/utils/language.py | 5 +- 4 files changed, 549 insertions(+), 36 deletions(-) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index 93ca2b161..5d7b21fa5 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -384,7 +384,8 @@ Language.TURKISH, Language.VIETNAMESE, Language.CHINESE, - # Optionally: Haitian, Quechu + Language.HAITIAN, + Language.QUECHUA, ] for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] ] diff --git a/src/lighteval/tasks/templates/nli.py b/src/lighteval/tasks/templates/nli.py index 5c7abec06..cea3d0f87 100644 --- a/src/lighteval/tasks/templates/nli.py +++ b/src/lighteval/tasks/templates/nli.py @@ -244,6 +244,8 @@ def prompt_fn(line: dict, task_name: str): choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearanged_labales[:-1]) hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearanged_labales[-1]}{translation_literals.question_mark}" + # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Neither in CF fomulation. However as of right now lighteval doesn't allow to + # use multi-context. 
row = { "instruction": input_data.get("instruction", ""), "premise": premise, diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 72a7cdc7a..956f0d65f 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -132,6 +132,8 @@ def __getattribute__(self, name: str) -> str: sentence_space=" ", colon=":", ), + # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mgsm/utils.py + Language.BENGALI: TranslationLiterals(language=Language.BENGALI, question_word="প্রশ্ন"), Language.FRENCH: TranslationLiterals( language=Language.FRENCH, question_word="question", @@ -258,47 +260,554 @@ def __getattribute__(self, name: str) -> str: sentence_space=" ", colon=":", ), - Language.SPANISH: TranslationLiterals(language=Language.SPANISH), - Language.PORTUGUESE: TranslationLiterals(language=Language.PORTUGUESE), - Language.ITALIAN: TranslationLiterals(language=Language.ITALIAN), + Language.SPANISH: TranslationLiterals( + language=Language.SPANISH, + question_word="pregunta", + answer="respuesta", + confirmation_word="cierto", + yes="sì", + no="no", + also="también", + cause_word="porque", + effect_word="por lo tanto", + or_word="o", + true="verdadero", + false="falso", + neither="ninguno", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.PORTUGUESE: TranslationLiterals( + language=Language.PORTUGUESE, + question_word="pergunta", + answer="resposta", + confirmation_word="certo", + yes="sim", + no="não", + also="adicionalmente", + cause_word="porque", + effect_word="logo", + or_word="ou", + true="verdadeiro", + false="falso", + neither="nenhum", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + 
semicolon=";", + ), + Language.ITALIAN: TranslationLiterals( + language=Language.ITALIAN, + question_word="domanda", + answer="risposta", + confirmation_word="vero", + yes="sì", + no="no", + also="inoltre", + cause_word="perchè", + effect_word="quindi", + or_word="o", + true="vero", + false="falso", + neither="nessuno dei due", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.GERMAN: TranslationLiterals( + language=Language.GERMAN, + question_word="frage", + answer="antwort", + confirmation_word="richtig", + yes="ja", + no="nein", + also="auch", + cause_word="weil", + effect_word="deshalb", + or_word="oder", + true="wahr", + false="falsch", + neither="weder noch", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.CZECH: TranslationLiterals( + language=Language.CZECH, + question_word="otázka", + answer="odpověď", + confirmation_word="že ano", + yes="ano", + no="ne", + also="navíc", + cause_word="protože", + effect_word="a tedy", + or_word="nebo", + true="pravda", + false="nepravda", + neither="ani jedno", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.DANISH: TranslationLiterals(language=Language.DANISH), + Language.DUTCH: TranslationLiterals( + language=Language.DUTCH, + question_word="vraag", + answer="antwoord", + confirmation_word="toch", + yes="ja", + no="nee", + also="ook", + cause_word="want", + effect_word="dus", + or_word="of", + true="waar", + false="onwaar", + neither="geen van beide", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.ESTONIAN: TranslationLiterals( + language=Language.ESTONIAN, + # 
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/xcopa/utils.py + cause_word="sest", + effect_word="seetõttu", + ), + Language.FINNISH: TranslationLiterals( + language=Language.FINNISH, + question_word="kysymys", + answer="vastaus", + confirmation_word="eikö niin", + yes="kyllä", + no="ei", + also="myös", + cause_word="koska", + effect_word="siksi", + or_word="tai", + true="totta", + false="tarua", + neither="ei kumpikaan", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.GREEK: TranslationLiterals( + language=Language.GREEK, + question_word="ερώτηση", + answer="απάντηση", + confirmation_word="σωστά", + yes="ναι", + no="όχι", + also="επίσης", + cause_word="επειδή", + effect_word="άρα", + or_word="ή", + true="σωστό", + false="λάθος", + neither="καμία απάντηση", + full_stop=".", + comma=",", + question_mark=";", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon="·", + ), + Language.HUNGARIAN: TranslationLiterals( + language=Language.HUNGARIAN, + question_word="kérdés", + answer="válasz", + confirmation_word="ugye", + yes="igen", + no="nem", + also="is", + cause_word="mert", + effect_word="ezért", + or_word="vagy", + true="igaz", + false="hamis", + neither="egyik sem", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.ICELANDIC: TranslationLiterals(language=Language.ICELANDIC), + Language.INDONESIAN: TranslationLiterals( + language=Language.INDONESIAN, + question_word="pertanyaan", + answer="jawaban", + confirmation_word="kan", + yes="ya", + no="tidak", + also="juga", + cause_word="karena", + effect_word="oleh sebab itu", + or_word="atau", + true="benar", + false="salah", + neither="tidak satu pun", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + 
word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + # TODO: Add Japanese + Language.JAPANESE: TranslationLiterals( + language=Language.JAPANESE, + question_word="質問", + answer="回答", + confirmation_word="ね", + yes="はい", + no="いいえ", + also="また", + cause_word="なので", + effect_word="なので", + or_word="か/また/あるいは", + true="正解", + false="不正解", + neither="またはどちらでもない", + full_stop="。", + comma="、", + question_mark="?", + exclamation_mark="!", + word_space="", + sentence_space="", + colon=":", + semicolon=";", + ), + Language.KOREAN: TranslationLiterals( + # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/paws-x/_generate_config.py + language=Language.KOREAN, + question_word="맞죠", + yes="예", + no="아니오", + ), + Language.NORWEGIAN: TranslationLiterals( + language=Language.NORWEGIAN, + question_word="spørsmål", + answer="svar", + confirmation_word="ikke sant", + yes="ja", + no="nei", + also="i tillegg", + cause_word="fordi", + effect_word="derfor", + or_word="eller", + true="sant", + false="usant", + neither="ingen av delene", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.POLISH: TranslationLiterals( + language=Language.POLISH, + question_word="pytanie", + answer="odpowiedź", + confirmation_word="prawda", + yes="tak", + no="nie", + also="ponadto", + cause_word="ponieważ", + effect_word="więc", + or_word="lub", + true="prawda", + false="fałsz", + neither="ani jedno ani drugie", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.HAITIAN: TranslationLiterals( + language=Language.HAITIAN, + cause_word="poukisa", + effect_word="donk sa", + ), Language.ROMANIAN: TranslationLiterals(language=Language.ROMANIAN), - Language.GERMAN: TranslationLiterals(language=Language.GERMAN), + # Some tasks (mlmm) use the latin 
alphabet, it would be wise to distinguish scripts in future + # Latin script for Serbian + # Language.SERBIAN: TranslationLiterals( + # language=Language.SERBIAN, + # question_word="pitanje", + # answer="odgovor", + # confirmation_word="zar ne", + # yes="da", + # no="ne", + # also="takođe", + # cause_word="jer", + # effect_word="dakle", + # or_word="ili", + # true="tačno", + # false="netačno", + # neither="ništa od navedenog", + # full_stop=".", + # comma=",", + # question_mark="?", + # exclamation_mark="!", + # word_space=" ", + # sentence_space=" ", + # colon=":", + # semicolon=";", + # ), + Language.SERBIAN: TranslationLiterals( + language=Language.SERBIAN, + question_word="питање", + answer="одговор", + confirmation_word="зар не", + yes="да", + no="не", + also="такође", + cause_word="јер", + effect_word="дакле", + or_word="или", + true="тачно", + false="нетачно", + neither="ништа од наведеног", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.CROATIAN: TranslationLiterals( + language=Language.CROATIAN, + question_word="pitanje", + answer="odgovor", + confirmation_word="zar ne", + yes="da", + no="ne", + also="također", + cause_word="jer", + effect_word="dakle", + or_word="ili", + true="točno", + false="netočno", + neither="ništa od navedenog", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.SLOVAK: TranslationLiterals( + language=Language.SLOVAK, + question_word="otázka", + answer="odpověď", + confirmation_word="že áno", + yes="áno", + no="ne", + also="taktiež", + cause_word="pretože", + effect_word="takže", + or_word="alebo", + true="pravda", + false="nepravda", + neither="ani jeden", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + 
Language.SWEDISH: TranslationLiterals( + language=Language.SWEDISH, + question_word="fråga", + answer="svar", + confirmation_word="eller hur", + yes="ja", + no="nej", + also="också", + cause_word="eftersom", + effect_word="därför att", + or_word="eller", + true="sant", + false="falskt", + neither="ingendera", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.UKRAINIAN: TranslationLiterals( + language=Language.UKRAINIAN, + question_word="питання", + answer="відповідь", + confirmation_word="правда", + yes="так", + no="ні", + also="також", + cause_word="тому що", + effect_word="отже", + or_word="або", + true="правда", + false="неправда", + neither="ні те, ні інше", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.URDU: TranslationLiterals( + language=Language.URDU, + question_word="سوال", + answer="جواب", + confirmation_word="نا", + yes="ہاں", + no="نہیں", + also="اور", + cause_word="کیونکہ", + effect_word="اس لئے", + or_word="یا", + true="درست", + false="غلط", + neither="کوئی نہیں", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.VIETNAMESE: TranslationLiterals( + language=Language.VIETNAMESE, + question_word="Câu hỏi", + answer="Trả lời", + confirmation_word="đúng", + yes="Có", + no="Không", + also="Cũng", + cause_word="vì", + effect_word="do đó", + or_word="hoặc", + true="đúng", + false="sai", + neither="không đúng cũng không sai", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.BASQUE: TranslationLiterals( + language=Language.BASQUE, + question_word="galdera", + answer="erantzuna", + confirmation_word="ezta", + yes="bai", + 
no="ez", + also="halaber", + cause_word="zaren", + effect_word="horregatik", + or_word="ala", + true="egia", + false="faltsua", + neither="bat ere ez", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.TAMIL: TranslationLiterals( + language=Language.TAMIL, + cause_word="காரணமாக", + effect_word="எனவே", + ), + Language.QUECHUA: TranslationLiterals( + language=Language.QUECHUA, + cause_word="imataq", + effect_word="chaymi", + ), Language.LATIN: TranslationLiterals(language=Language.LATIN), - Language.CZECH: TranslationLiterals(language=Language.CZECH), - Language.DANISH: TranslationLiterals(language=Language.DANISH), - Language.FINNISH: TranslationLiterals(language=Language.FINNISH), - Language.GREEK: TranslationLiterals(language=Language.GREEK), - Language.NORWEGIAN: TranslationLiterals(language=Language.NORWEGIAN), - Language.POLISH: TranslationLiterals(language=Language.POLISH), - Language.SLOVENIAN: TranslationLiterals(language=Language.SLOVENIAN), - Language.DUTCH: TranslationLiterals(language=Language.DUTCH), - Language.JAPANESE: TranslationLiterals(language=Language.JAPANESE), - Language.VIETNAMESE: TranslationLiterals(language=Language.VIETNAMESE), - Language.INDONESIAN: TranslationLiterals(language=Language.INDONESIAN), - Language.PERSIAN: TranslationLiterals(language=Language.PERSIAN), - Language.KOREAN: TranslationLiterals(language=Language.KOREAN), - Language.BENGALI: TranslationLiterals(language=Language.BENGALI), - Language.TAMIL: TranslationLiterals(language=Language.TAMIL), - Language.HUNGARIAN: TranslationLiterals(language=Language.HUNGARIAN), - Language.UKRAINIAN: TranslationLiterals(language=Language.UKRAINIAN), - Language.SLOVAK: TranslationLiterals(language=Language.SLOVAK), - Language.BULGARIAN: TranslationLiterals(language=Language.BULGARIAN), - Language.CATALAN: TranslationLiterals(language=Language.CATALAN), - Language.CROATIAN: 
TranslationLiterals(language=Language.CROATIAN), - Language.SERBIAN: TranslationLiterals(language=Language.SERBIAN), - Language.LITHUANIAN: TranslationLiterals(language=Language.LITHUANIAN), - Language.ESTONIAN: TranslationLiterals(language=Language.ESTONIAN), - Language.HEBREW: TranslationLiterals(language=Language.HEBREW), - Language.LATVIAN: TranslationLiterals(language=Language.LATVIAN), Language.SERBOCROATIAN: TranslationLiterals(language=Language.SERBOCROATIAN), # Deprecated Language.ALBANIAN: TranslationLiterals(language=Language.ALBANIAN), Language.AZERBAIJANI: TranslationLiterals(language=Language.AZERBAIJANI), - Language.ICELANDIC: TranslationLiterals(language=Language.ICELANDIC), Language.MACEDONIAN: TranslationLiterals(language=Language.MACEDONIAN), Language.GEORGIAN: TranslationLiterals(language=Language.GEORGIAN), Language.GALICIAN: TranslationLiterals(language=Language.GALICIAN), Language.ARMENIAN: TranslationLiterals(language=Language.ARMENIAN), - Language.BASQUE: TranslationLiterals(language=Language.BASQUE), Language.MALAY: TranslationLiterals(language=Language.MALAY), Language.TAGALOG: TranslationLiterals(language=Language.TAGALOG), Language.JAVANESE: TranslationLiterals(language=Language.JAVANESE), @@ -307,7 +816,6 @@ def __getattribute__(self, name: str) -> str: Language.GUJARATI: TranslationLiterals(language=Language.GUJARATI), Language.YORUBA: TranslationLiterals(language=Language.YORUBA), Language.MARATHI: TranslationLiterals(language=Language.MARATHI), - Language.URDU: TranslationLiterals(language=Language.URDU), Language.AMHARIC: TranslationLiterals(language=Language.AMHARIC), Language.MALAYALAM: TranslationLiterals(language=Language.MALAYALAM), Language.KANNADA: TranslationLiterals(language=Language.KANNADA), @@ -349,5 +857,4 @@ def __getattribute__(self, name: str) -> str: Language.SORANI: TranslationLiterals(language=Language.SORANI), Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO), Language.WAR: 
TranslationLiterals(language=Language.WAR), - Language.SWEDISH: TranslationLiterals(language=Language.SWEDISH), } diff --git a/src/lighteval/utils/language.py b/src/lighteval/utils/language.py index 9474f419e..6fb444933 100644 --- a/src/lighteval/utils/language.py +++ b/src/lighteval/utils/language.py @@ -78,6 +78,7 @@ class Language(Enum): URDU = "urd" AMHARIC = "amh" TELUGU = "tel" + HAITIAN = "hti" MALAYALAM = "mal" KANNADA = "kan" NEPALI = "nep" @@ -113,6 +114,7 @@ class Language(Enum): SOMALI = "som" SANSKRIT = "san" SINDHI = "snd" + QUECHUA = "que" TURKMEN = "tuk" SOUTH_AZERBAIJANI = "azb" SORANI = "ckb" @@ -216,7 +218,7 @@ class Language(Enum): # 'zul': Language.ZULU, # 'bod': Language.TIBETAN, "eng": Language.ENGLISH, - # 'hat': Language.HAITIAN, + "hat": Language.HAITIAN, # 'ilo': Language.ILOCANO, "kaz": Language.KAZAKH, "lit": Language.LITHUANIAN, @@ -241,6 +243,7 @@ class Language(Enum): "ary": Language.ARABIC, "cat": Language.CATALAN, "eus": Language.BASQUE, + "que": Language.QUECHUA, "heb": Language.HEBREW, "isl": Language.ICELANDIC, # 'khk': Language.MONGOLIAN, From fd32651abf8343d5a3ad6411be67ff593079adce Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Thu, 10 Oct 2024 17:46:36 +0200 Subject: [PATCH 2/6] small nits to translation literals --- .../templates/utils/translation_literals.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 956f0d65f..83c463538 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -265,7 +265,7 @@ def __getattribute__(self, name: str) -> str: question_word="pregunta", answer="respuesta", confirmation_word="cierto", - yes="sì", + yes="sí", no="no", also="también", cause_word="porque", @@ -484,7 +484,7 @@ def __getattribute__(self, name: str) -> str: no="tidak", 
also="juga", cause_word="karena", - effect_word="oleh sebab itu", + effect_word="maka", or_word="atau", true="benar", false="salah", @@ -498,21 +498,20 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), - # TODO: Add Japanese Language.JAPANESE: TranslationLiterals( language=Language.JAPANESE, question_word="質問", answer="回答", - confirmation_word="ね", + confirmation_word="でしょ", yes="はい", no="いいえ", also="また", cause_word="なので", effect_word="なので", - or_word="か/また/あるいは", + or_word="または", true="正解", false="不正解", - neither="またはどちらでもない", + neither="どちらでもない", full_stop="。", comma="、", question_mark="?", @@ -523,7 +522,6 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.KOREAN: TranslationLiterals( - # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/paws-x/_generate_config.py language=Language.KOREAN, question_word="맞죠", yes="예", @@ -655,10 +653,10 @@ def __getattribute__(self, name: str) -> str: Language.SLOVAK: TranslationLiterals( language=Language.SLOVAK, question_word="otázka", - answer="odpověď", + answer="odpoveď", confirmation_word="že áno", yes="áno", - no="ne", + no="nie", also="taktiež", cause_word="pretože", effect_word="takže", @@ -737,7 +735,7 @@ def __getattribute__(self, name: str) -> str: neither="کوئی نہیں", full_stop=".", comma=",", - question_mark="?", + question_mark="؟", exclamation_mark="!", word_space=" ", sentence_space=" ", @@ -746,12 +744,12 @@ def __getattribute__(self, name: str) -> str: ), Language.VIETNAMESE: TranslationLiterals( language=Language.VIETNAMESE, - question_word="Câu hỏi", - answer="Trả lời", + question_word="câu hỏi", + answer="trả lời", confirmation_word="đúng", - yes="Có", - no="Không", - also="Cũng", + yes="có", + no="không", + also="cũng", cause_word="vì", effect_word="do đó", or_word="hoặc", @@ -775,8 +773,8 @@ def __getattribute__(self, name: str) -> str: yes="bai", no="ez", also="halaber", - cause_word="zaren", - effect_word="horregatik", + 
cause_word="izan ere", + effect_word="beraz", or_word="ala", true="egia", false="faltsua", From fffca1942c055aedca1d65d79ee7fe8054bf1078 Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Thu, 10 Oct 2024 20:32:40 +0200 Subject: [PATCH 3/6] true/false/or/neither for canaries + nit --- .../templates/utils/translation_literals.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 83c463538..82b980a7d 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -106,6 +106,10 @@ def __getattribute__(self, name: str) -> str: also="كذلك", cause_word="لأن", effect_word="لذلك", + true="صحيح", + false="خاطئ", + neither="لا هذا ولا ذاك", + or_word="أو", full_stop=".", comma="،", question_mark="؟", @@ -113,6 +117,7 @@ def __getattribute__(self, name: str) -> str: word_space=" ", sentence_space=" ", colon=":", + indices=["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح"], ), Language.SWAHILI: TranslationLiterals( language=Language.SWAHILI, @@ -124,6 +129,10 @@ def __getattribute__(self, name: str) -> str: also="pia", cause_word="kwa sababu", effect_word="kwa hiyo", + true="kweli", + false="uongo", + neither="hakuna kati ya hizo", + or_word="au", full_stop=".", comma=",", question_mark="?", @@ -144,6 +153,10 @@ def __getattribute__(self, name: str) -> str: also="de plus", cause_word="parce que", effect_word="donc", + or_word="ou", + true="vrai", + false="faux", + neither="aucun des deux", full_stop=".", comma=",", question_mark="?", @@ -162,6 +175,10 @@ def __getattribute__(self, name: str) -> str: also="అలాగే", cause_word="ఎందుకంటే", effect_word="అందువలన", + or_word="లేదా", + true="నిజం", + false="తప్పు", + neither="ఏదీ కాదు", full_stop=".", comma=",", question_mark="?", @@ -169,6 +186,7 @@ def __getattribute__(self, name: str) -> str: 
word_space=" ", sentence_space=" ", colon=":", + indices=["ఎ", "బి", "సి", "డి", "ఇ"], ), Language.HINDI: TranslationLiterals( language=Language.HINDI, @@ -180,6 +198,10 @@ def __getattribute__(self, name: str) -> str: also="साथ ही", cause_word="क्योंकि", effect_word="इसलिए", + true="सत्य", + false="असत्य", + neither="न तो दोनों ", + or_word="या", full_stop="।", comma=",", question_mark="?", @@ -187,6 +209,7 @@ def __getattribute__(self, name: str) -> str: word_space=" ", sentence_space=" ", colon=":", + indices=["क", "ख", "ग", "घ", "ङ", "च"], ), Language.CHINESE: TranslationLiterals( language=Language.CHINESE, @@ -198,6 +221,10 @@ def __getattribute__(self, name: str) -> str: also="而且", cause_word="因为", effect_word="所以", + true="真", + false="假", + neither="都不是", + or_word="或", full_stop="。", comma=",", question_mark="?", @@ -205,6 +232,7 @@ def __getattribute__(self, name: str) -> str: word_space="", sentence_space="", colon=":", + indices=["①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩"], ), Language.RUSSIAN: TranslationLiterals( language=Language.RUSSIAN, @@ -216,6 +244,10 @@ def __getattribute__(self, name: str) -> str: also="к тому же", cause_word="потому что", effect_word="поэтому", + true="истина", + false="ложь", + neither="ни то ни другое", + or_word="или", full_stop=".", comma=",", question_mark="?", @@ -223,6 +255,7 @@ def __getattribute__(self, name: str) -> str: word_space=" ", sentence_space=" ", colon=":", + indices=["А", "Б", "В", "Г", "Д", "Е"], ), Language.THAI: TranslationLiterals( language=Language.THAI, @@ -234,6 +267,10 @@ def __getattribute__(self, name: str) -> str: also="และ", cause_word="เพราะ", effect_word="ดังนั้น", + true="истина", + false="ложь", + neither="ни то ни другое", + or_word="или", full_stop=".", comma=",", question_mark="?", @@ -241,6 +278,7 @@ def __getattribute__(self, name: str) -> str: word_space="", sentence_space=" ", colon=":", + indices=["๑", "๒", "๓", "๔", "๕", "๖", "๗", "๘", "๙", "๐"], ), Language.TURKISH: 
TranslationLiterals( language=Language.TURKISH, @@ -252,6 +290,10 @@ def __getattribute__(self, name: str) -> str: also="ayrıca", cause_word="çünkü", effect_word="bu yüzden", + true="doğru", + false="yanlış", + neither="hiçbiri", + or_word="veya", full_stop=".", comma=",", question_mark="?", @@ -733,14 +775,14 @@ def __getattribute__(self, name: str) -> str: true="درست", false="غلط", neither="کوئی نہیں", - full_stop=".", - comma=",", + full_stop="۔", + comma="،", question_mark="؟", exclamation_mark="!", word_space=" ", sentence_space=" ", colon=":", - semicolon=";", + semicolon="؛", ), Language.VIETNAMESE: TranslationLiterals( language=Language.VIETNAMESE, From 10f86159a41ef1fd49c0effe485a0d37bd5539ac Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Thu, 10 Oct 2024 23:35:39 +0200 Subject: [PATCH 4/6] last nits --- .../templates/utils/translation_literals.py | 772 +++++++++--------- 1 file changed, 382 insertions(+), 390 deletions(-) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 82b980a7d..4df356111 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -74,28 +74,9 @@ def __getattribute__(self, name: str) -> str: TRANSLATION_LITERALS: dict[Language, TranslationLiterals] = { - Language.ENGLISH: TranslationLiterals( - language=Language.ENGLISH, - question_word="question", - answer="answer", - confirmation_word="right", - yes="yes", - no="no", - also="also", - cause_word="because", - effect_word="therefore", - true="true", - false="false", - neither="neither", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - or_word="or", - ), + Language.AFRIKAANS: TranslationLiterals(language=Language.AFRIKAANS), + Language.ALBANIAN: TranslationLiterals(language=Language.ALBANIAN), + Language.AMHARIC: 
TranslationLiterals(language=Language.AMHARIC), Language.ARABIC: TranslationLiterals( language=Language.ARABIC, question_word="سؤال", @@ -119,20 +100,24 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح"], ), - Language.SWAHILI: TranslationLiterals( - language=Language.SWAHILI, - question_word="swali", - answer="jibu", - confirmation_word="sahihi", - yes="ndiyo", - no="hapana", - also="pia", - cause_word="kwa sababu", - effect_word="kwa hiyo", - true="kweli", - false="uongo", - neither="hakuna kati ya hizo", - or_word="au", + Language.ARMENIAN: TranslationLiterals(language=Language.ARMENIAN), + Language.ASSAMESE: TranslationLiterals(language=Language.ASSAMESE), + Language.AZERBAIJANI: TranslationLiterals(language=Language.AZERBAIJANI), + Language.BASHKIR: TranslationLiterals(language=Language.BASHKIR), + Language.BASQUE: TranslationLiterals( + language=Language.BASQUE, + question_word="galdera", + answer="erantzuna", + confirmation_word="ezta", + yes="bai", + no="ez", + also="halaber", + cause_word="izan ere", + effect_word="beraz", + or_word="ala", + true="egia", + false="faltsua", + neither="bat ere ez", full_stop=".", comma=",", question_mark="?", @@ -140,82 +125,20 @@ def __getattribute__(self, name: str) -> str: word_space=" ", sentence_space=" ", colon=":", + semicolon=";", ), - # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mgsm/utils.py + Language.BELARUSIAN: TranslationLiterals(language=Language.BELARUSIAN), Language.BENGALI: TranslationLiterals(language=Language.BENGALI, question_word="প্রশ্ন"), - Language.FRENCH: TranslationLiterals( - language=Language.FRENCH, - question_word="question", - answer="réponse", - confirmation_word="n'est-ce pas", - yes="oui", - no="non", - also="de plus", - cause_word="parce que", - effect_word="donc", - or_word="ou", - true="vrai", - false="faux", - neither="aucun des deux", - full_stop=".", - comma=",", - question_mark="?", - 
exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - ), - Language.TELUGU: TranslationLiterals( - language=Language.TELUGU, - question_word="ప్రశ్న", - answer="జవాబు", - confirmation_word="కదా", - yes="అవును", - no="కాదు", - also="అలాగే", - cause_word="ఎందుకంటే", - effect_word="అందువలన", - or_word="లేదా", - true="నిజం", - false="తప్పు", - neither="ఏదీ కాదు", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - indices=["ఎ", "బి", "సి", "డి", "ఇ"], - ), - Language.HINDI: TranslationLiterals( - language=Language.HINDI, - question_word="सवाल", - answer="उत्तर", - confirmation_word="है ना", - yes="हाँ", - no="नहीं", - also="साथ ही", - cause_word="क्योंकि", - effect_word="इसलिए", - true="सत्य", - false="असत्य", - neither="न तो दोनों ", - or_word="या", - full_stop="।", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - indices=["क", "ख", "ग", "घ", "ङ", "च"], - ), + Language.BIHARI: TranslationLiterals(language=Language.BIHARI), # Deprecated + Language.BOSNIAN: TranslationLiterals(language=Language.BOSNIAN), + Language.BRETON: TranslationLiterals(language=Language.BRETON), + Language.BURMESE: TranslationLiterals(language=Language.BURMESE), + Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO), Language.CHINESE: TranslationLiterals( language=Language.CHINESE, question_word="问题", answer="答案", - confirmation_word="是不是", + confirmation_word="对吗", yes="是的", no="不是", also="而且", @@ -234,157 +157,20 @@ def __getattribute__(self, name: str) -> str: colon=":", indices=["①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩"], ), - Language.RUSSIAN: TranslationLiterals( - language=Language.RUSSIAN, - question_word="вопрос", - answer="ответ", - confirmation_word="не так ли", - yes="да", - no="нет", - also="к тому же", - cause_word="потому что", - effect_word="поэтому", - true="истина", - false="ложь", - neither="ни то 
ни другое", - or_word="или", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - indices=["А", "Б", "В", "Г", "Д", "Е"], - ), - Language.THAI: TranslationLiterals( - language=Language.THAI, - question_word="คำถาม", - answer="คำตอบ", - confirmation_word="ใช่ไหม", - yes="ใช่", - no="ไม่", - also="และ", - cause_word="เพราะ", - effect_word="ดังนั้น", - true="истина", - false="ложь", - neither="ни то ни другое", - or_word="или", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space="", - sentence_space=" ", - colon=":", - indices=["๑", "๒", "๓", "๔", "๕", "๖", "๗", "๘", "๙", "๐"], - ), - Language.TURKISH: TranslationLiterals( - language=Language.TURKISH, - question_word="soru", - answer="cevap", - confirmation_word="değil mi", - yes="evet", - no="hayır", - also="ayrıca", - cause_word="çünkü", - effect_word="bu yüzden", - true="doğru", - false="yanlış", - neither="hiçbiri", - or_word="veya", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - ), - Language.SPANISH: TranslationLiterals( - language=Language.SPANISH, - question_word="pregunta", - answer="respuesta", - confirmation_word="cierto", - yes="sí", - no="no", - also="también", - cause_word="porque", - effect_word="por lo tanto", - or_word="o", - true="verdadero", - false="falso", - neither="ninguno", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - semicolon=";", - ), - Language.PORTUGUESE: TranslationLiterals( - language=Language.PORTUGUESE, - question_word="pergunta", - answer="resposta", - confirmation_word="certo", - yes="sim", - no="não", - also="adicionalmente", - cause_word="porque", - effect_word="logo", - or_word="ou", - true="verdadeiro", - false="falso", - neither="nenhum", - full_stop=".", - comma=",", - question_mark="?", - 
exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - semicolon=";", - ), - Language.ITALIAN: TranslationLiterals( - language=Language.ITALIAN, - question_word="domanda", - answer="risposta", - confirmation_word="vero", - yes="sì", - no="no", - also="inoltre", - cause_word="perchè", - effect_word="quindi", - or_word="o", - true="vero", - false="falso", - neither="nessuno dei due", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - semicolon=";", - ), - Language.GERMAN: TranslationLiterals( - language=Language.GERMAN, - question_word="frage", - answer="antwort", - confirmation_word="richtig", - yes="ja", - no="nein", - also="auch", - cause_word="weil", - effect_word="deshalb", - or_word="oder", - true="wahr", - false="falsch", - neither="weder noch", + Language.CROATIAN: TranslationLiterals( + language=Language.CROATIAN, + question_word="pitanje", + answer="odgovor", + confirmation_word="zar ne", + yes="da", + no="ne", + also="također", + cause_word="jer", + effect_word="dakle", + or_word="ili", + true="točno", + false="netočno", + neither="ništa od navedenog", full_stop=".", comma=",", question_mark="?", @@ -418,6 +204,7 @@ def __getattribute__(self, name: str) -> str: semicolon=";", ), Language.DANISH: TranslationLiterals(language=Language.DANISH), + Language.DIVEHI: TranslationLiterals(language=Language.DIVEHI), Language.DUTCH: TranslationLiterals( language=Language.DUTCH, question_word="vraag", @@ -441,9 +228,32 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.ENGLISH: TranslationLiterals( + language=Language.ENGLISH, + question_word="question", + answer="answer", + confirmation_word="right", + yes="yes", + no="no", + also="also", + cause_word="because", + effect_word="therefore", + true="true", + false="false", + neither="neither", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" 
", + sentence_space=" ", + colon=":", + or_word="or", + ), + Language.ESPERANTO: TranslationLiterals(language=Language.ESPERANTO), Language.ESTONIAN: TranslationLiterals( + # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py language=Language.ESTONIAN, - # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/xcopa/utils.py cause_word="sest", effect_word="seetõttu", ), @@ -470,6 +280,53 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.FRENCH: TranslationLiterals( + language=Language.FRENCH, + question_word="question", + answer="réponse", + confirmation_word="n'est-ce pas", + yes="oui", + no="non", + also="de plus", + cause_word="parce que", + effect_word="donc", + or_word="ou", + true="vrai", + false="faux", + neither="aucun des deux", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + ), + Language.GALICIAN: TranslationLiterals(language=Language.GALICIAN), + Language.GEORGIAN: TranslationLiterals(language=Language.GEORGIAN), + Language.GERMAN: TranslationLiterals( + language=Language.GERMAN, + question_word="frage", + answer="antwort", + confirmation_word="richtig", + yes="ja", + no="nein", + also="auch", + cause_word="weil", + effect_word="deshalb", + or_word="oder", + true="wahr", + false="falsch", + neither="weder noch", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), Language.GREEK: TranslationLiterals( language=Language.GREEK, question_word="ερώτηση", @@ -493,6 +350,36 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon="·", ), + Language.GUJARATI: TranslationLiterals(language=Language.GUJARATI), + Language.HAITIAN: TranslationLiterals( + # From 
https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py + language=Language.HAITIAN, + cause_word="poukisa", + effect_word="donk sa", + ), + Language.HINDI: TranslationLiterals( + language=Language.HINDI, + question_word="सवाल", + answer="उत्तर", + confirmation_word="है ना", + yes="हाँ", + no="नहीं", + also="साथ ही", + cause_word="क्योंकि", + effect_word="इसलिए", + true="सत्य", + false="असत्य", + neither="न तो यह, न वह", + or_word="या", + full_stop="।", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + indices=["क", "ख", "ग", "घ", "ङ", "च"], + ), Language.HUNGARIAN: TranslationLiterals( language=Language.HUNGARIAN, question_word="kérdés", @@ -540,16 +427,40 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.IRISH: TranslationLiterals(language=Language.IRISH), + Language.ITALIAN: TranslationLiterals( + language=Language.ITALIAN, + question_word="domanda", + answer="risposta", + confirmation_word="vero", + yes="sì", + no="no", + also="inoltre", + cause_word="perchè", + effect_word="quindi", + or_word="o", + true="vero", + false="falso", + neither="nessuno dei due", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), Language.JAPANESE: TranslationLiterals( language=Language.JAPANESE, question_word="質問", answer="回答", - confirmation_word="でしょ", + confirmation_word="でしょうか", yes="はい", no="いいえ", also="また", cause_word="なので", - effect_word="なので", + effect_word="なぜなら", or_word="または", true="正解", false="不正解", @@ -563,12 +474,28 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.JAVANESE: TranslationLiterals(language=Language.JAVANESE), + Language.KANNADA: TranslationLiterals(language=Language.KANNADA), + Language.KAZAKH: TranslationLiterals(language=Language.KAZAKH), + 
Language.KHMER: TranslationLiterals(language=Language.KHMER), + Language.KIRGHIZ: TranslationLiterals(language=Language.KIRGHIZ), Language.KOREAN: TranslationLiterals( language=Language.KOREAN, - question_word="맞죠", + confirmation_word="맞죠", yes="예", no="아니오", ), + Language.KURDISH: TranslationLiterals(language=Language.KURDISH), + Language.LAO: TranslationLiterals(language=Language.LAO), + Language.LATIN: TranslationLiterals(language=Language.LATIN), + Language.LUXEMBOURGISH: TranslationLiterals(language=Language.LUXEMBOURGISH), + Language.MACEDONIAN: TranslationLiterals(language=Language.MACEDONIAN), + Language.MALAGASY: TranslationLiterals(language=Language.MALAGASY), + Language.MALAY: TranslationLiterals(language=Language.MALAY), + Language.MALAYALAM: TranslationLiterals(language=Language.MALAYALAM), + Language.MALTESE: TranslationLiterals(language=Language.MALTESE), + Language.MARATHI: TranslationLiterals(language=Language.MARATHI), + Language.NEPALI: TranslationLiterals(language=Language.NEPALI), Language.NORWEGIAN: TranslationLiterals( language=Language.NORWEGIAN, question_word="spørsmål", @@ -592,6 +519,10 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.NORWEGIAN_NYNORSK: TranslationLiterals(language=Language.NORWEGIAN_NYNORSK), + Language.OCCITAN: TranslationLiterals(language=Language.OCCITAN), + Language.ORIYA: TranslationLiterals(language=Language.ORIYA), + Language.PASHTO: TranslationLiterals(language=Language.PASHTO), Language.POLISH: TranslationLiterals( language=Language.POLISH, question_word="pytanie", @@ -613,18 +544,65 @@ def __getattribute__(self, name: str) -> str: word_space=" ", sentence_space=" ", colon=":", - semicolon=";", - ), - Language.HAITIAN: TranslationLiterals( - language=Language.HAITIAN, - cause_word="poukisa", - effect_word="donk sa", + semicolon=";", + ), + Language.PORTUGUESE: TranslationLiterals( + language=Language.PORTUGUESE, + question_word="pergunta", + answer="resposta", + 
confirmation_word="certo", + yes="sim", + no="não", + also="adicionalmente", + cause_word="porque", + effect_word="logo", + or_word="ou", + true="verdadeiro", + false="falso", + neither="nenhum", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.PUNJABI: TranslationLiterals(language=Language.PUNJABI), + Language.QUECHUA: TranslationLiterals( + # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py + language=Language.QUECHUA, + cause_word="imataq", + effect_word="chaymi", + ), + Language.ROMANIAN: TranslationLiterals(language=Language.ROMANIAN), + Language.RUSSIAN: TranslationLiterals( + language=Language.RUSSIAN, + question_word="вопрос", + answer="ответ", + confirmation_word="не так ли", + yes="да", + no="нет", + also="к тому же", + cause_word="потому что", + effect_word="поэтому", + true="истина", + false="ложь", + neither="ни то ни другое", + or_word="или", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + indices=["А", "Б", "В", "Г", "Д", "Е"], ), - Language.ROMANIAN: TranslationLiterals(language=Language.ROMANIAN), - # Some tasks (mlmm) use the latin alphabet, it would be wise to distinguish scripts in future - # Latin script for Serbian - # Language.SERBIAN: TranslationLiterals( - # language=Language.SERBIAN, + Language.SANSKRIT: TranslationLiterals(language=Language.SANSKRIT), + # Latin serbian script for future when separating scipts + # Language.SERBIAN_LATIN: TranslationLiterals(language=Language.SERBIAN_LATIN, # question_word="pitanje", # answer="odgovor", # confirmation_word="zar ne", @@ -637,14 +615,6 @@ def __getattribute__(self, name: str) -> str: # true="tačno", # false="netačno", # neither="ništa od navedenog", - # full_stop=".", - # comma=",", - # question_mark="?", - # 
exclamation_mark="!", - # word_space=" ", - # sentence_space=" ", - # colon=":", - # semicolon=";", # ), Language.SERBIAN: TranslationLiterals( language=Language.SERBIAN, @@ -669,29 +639,9 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), - Language.CROATIAN: TranslationLiterals( - language=Language.CROATIAN, - question_word="pitanje", - answer="odgovor", - confirmation_word="zar ne", - yes="da", - no="ne", - also="također", - cause_word="jer", - effect_word="dakle", - or_word="ili", - true="točno", - false="netočno", - neither="ništa od navedenog", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - semicolon=";", - ), + Language.SERBOCROATIAN: TranslationLiterals(language=Language.SERBOCROATIAN), # Deprecated + Language.SINDHI: TranslationLiterals(language=Language.SINDHI), + Language.SINHALA: TranslationLiterals(language=Language.SINHALA), Language.SLOVAK: TranslationLiterals( language=Language.SLOVAK, question_word="otázka", @@ -715,6 +665,54 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.SOMALI: TranslationLiterals(language=Language.SOMALI), + Language.SORANI: TranslationLiterals(language=Language.SORANI), + Language.SOUTH_AZERBAIJANI: TranslationLiterals(language=Language.SOUTH_AZERBAIJANI), + Language.SPANISH: TranslationLiterals( + language=Language.SPANISH, + question_word="pregunta", + answer="respuesta", + confirmation_word="cierto", + yes="sí", + no="no", + also="también", + cause_word="porque", + effect_word="por lo tanto", + or_word="o", + true="verdadero", + false="falso", + neither="ninguno", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + semicolon=";", + ), + Language.SWAHILI: TranslationLiterals( + language=Language.SWAHILI, + question_word="swali", + answer="jibu", + confirmation_word="sahihi", + yes="ndiyo", + 
no="hapana", + also="pia", + cause_word="kwa sababu", + effect_word="kwa hiyo", + true="kweli", + false="uongo", + neither="hakuna kati ya hizo", + or_word="au", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + ), Language.SWEDISH: TranslationLiterals( language=Language.SWEDISH, question_word="fråga", @@ -738,6 +736,84 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), + Language.TAGALOG: TranslationLiterals(language=Language.TAGALOG), + Language.TAJIK: TranslationLiterals(language=Language.TAJIK), + Language.TAMIL: TranslationLiterals( + # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py + language=Language.TAMIL, + cause_word="காரணமாக", + effect_word="எனவே", + ), + Language.TATAR: TranslationLiterals(language=Language.TATAR), + Language.TELUGU: TranslationLiterals( + language=Language.TELUGU, + question_word="ప్రశ్న", + answer="జవాబు", + confirmation_word="కదా", + yes="అవును", + no="కాదు", + also="అలాగే", + cause_word="ఎందుకంటే", + effect_word="అందువలన", + or_word="లేదా", + true="నిజం", + false="తప్పు", + neither="ఏదీ కాదు", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + indices=["ఎ", "బి", "సి", "డి", "ఇ"], + ), + Language.THAI: TranslationLiterals( + language=Language.THAI, + question_word="คำถาม", + answer="คำตอบ", + confirmation_word="ใช่ไหม", + yes="ใช่", + no="ไม่", + also="และ", + cause_word="เพราะ", + effect_word="ดังนั้น", + true="จริง", + false="เท็จ", + neither="ไม่ใช่ทั้งสองอย่าง", + or_word="หรือ", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space="", + sentence_space=" ", + colon=":", + indices=["๑", "๒", "๓", "๔", "๕", "๖", "๗", "๘", "๙", "๐"], + ), + Language.TURKISH: TranslationLiterals( + language=Language.TURKISH, + question_word="soru", 
+ answer="cevap", + confirmation_word="değil mi", + yes="evet", + no="hayır", + also="ayrıca", + cause_word="çünkü", + effect_word="bu yüzden", + true="doğru", + false="yanlış", + neither="hiçbiri", + or_word="veya", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + ), + Language.TURKMEN: TranslationLiterals(language=Language.TURKMEN), Language.UKRAINIAN: TranslationLiterals( language=Language.UKRAINIAN, question_word="питання", @@ -784,6 +860,7 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon="؛", ), + Language.UZBEK: TranslationLiterals(language=Language.UZBEK), Language.VIETNAMESE: TranslationLiterals( language=Language.VIETNAMESE, question_word="câu hỏi", @@ -807,94 +884,9 @@ def __getattribute__(self, name: str) -> str: colon=":", semicolon=";", ), - Language.BASQUE: TranslationLiterals( - language=Language.BASQUE, - question_word="galdera", - answer="erantzuna", - confirmation_word="ezta", - yes="bai", - no="ez", - also="halaber", - cause_word="izan ere", - effect_word="beraz", - or_word="ala", - true="egia", - false="faltsua", - neither="bat ere ez", - full_stop=".", - comma=",", - question_mark="?", - exclamation_mark="!", - word_space=" ", - sentence_space=" ", - colon=":", - semicolon=";", - ), - Language.TAMIL: TranslationLiterals( - language=Language.TAMIL, - cause_word="காரணமாக", - effect_word="எனவே", - ), - Language.QUECHUA: TranslationLiterals( - language=Language.QUECHUA, - cause_word="imataq", - effect_word="chaymi", - ), - Language.LATIN: TranslationLiterals(language=Language.LATIN), - Language.SERBOCROATIAN: TranslationLiterals(language=Language.SERBOCROATIAN), # Deprecated - Language.ALBANIAN: TranslationLiterals(language=Language.ALBANIAN), - Language.AZERBAIJANI: TranslationLiterals(language=Language.AZERBAIJANI), - Language.MACEDONIAN: TranslationLiterals(language=Language.MACEDONIAN), - Language.GEORGIAN: 
TranslationLiterals(language=Language.GEORGIAN), - Language.GALICIAN: TranslationLiterals(language=Language.GALICIAN), - Language.ARMENIAN: TranslationLiterals(language=Language.ARMENIAN), - Language.MALAY: TranslationLiterals(language=Language.MALAY), - Language.TAGALOG: TranslationLiterals(language=Language.TAGALOG), - Language.JAVANESE: TranslationLiterals(language=Language.JAVANESE), - Language.PUNJABI: TranslationLiterals(language=Language.PUNJABI), - Language.BIHARI: TranslationLiterals(language=Language.BIHARI), # Deprecated - Language.GUJARATI: TranslationLiterals(language=Language.GUJARATI), - Language.YORUBA: TranslationLiterals(language=Language.YORUBA), - Language.MARATHI: TranslationLiterals(language=Language.MARATHI), - Language.AMHARIC: TranslationLiterals(language=Language.AMHARIC), - Language.MALAYALAM: TranslationLiterals(language=Language.MALAYALAM), - Language.KANNADA: TranslationLiterals(language=Language.KANNADA), - Language.NEPALI: TranslationLiterals(language=Language.NEPALI), - Language.KAZAKH: TranslationLiterals(language=Language.KAZAKH), - Language.BELARUSIAN: TranslationLiterals(language=Language.BELARUSIAN), - Language.BURMESE: TranslationLiterals(language=Language.BURMESE), - Language.ESPERANTO: TranslationLiterals(language=Language.ESPERANTO), - Language.UZBEK: TranslationLiterals(language=Language.UZBEK), - Language.KHMER: TranslationLiterals(language=Language.KHMER), - Language.TAJIK: TranslationLiterals(language=Language.TAJIK), + Language.WAR: TranslationLiterals(language=Language.WAR), Language.WELSH: TranslationLiterals(language=Language.WELSH), - Language.NORWEGIAN_NYNORSK: TranslationLiterals(language=Language.NORWEGIAN_NYNORSK), - Language.BOSNIAN: TranslationLiterals(language=Language.BOSNIAN), - Language.SINHALA: TranslationLiterals(language=Language.SINHALA), - Language.TATAR: TranslationLiterals(language=Language.TATAR), - Language.AFRIKAANS: TranslationLiterals(language=Language.AFRIKAANS), - Language.ORIYA: 
TranslationLiterals(language=Language.ORIYA), - Language.KIRGHIZ: TranslationLiterals(language=Language.KIRGHIZ), - Language.IRISH: TranslationLiterals(language=Language.IRISH), - Language.OCCITAN: TranslationLiterals(language=Language.OCCITAN), - Language.KURDISH: TranslationLiterals(language=Language.KURDISH), - Language.LAO: TranslationLiterals(language=Language.LAO), - Language.LUXEMBOURGISH: TranslationLiterals(language=Language.LUXEMBOURGISH), - Language.BASHKIR: TranslationLiterals(language=Language.BASHKIR), Language.WESTERN_FRISIAN: TranslationLiterals(language=Language.WESTERN_FRISIAN), - Language.PASHTO: TranslationLiterals(language=Language.PASHTO), - Language.MALTESE: TranslationLiterals(language=Language.MALTESE), - Language.BRETON: TranslationLiterals(language=Language.BRETON), - Language.ASSAMESE: TranslationLiterals(language=Language.ASSAMESE), - Language.MALAGASY: TranslationLiterals(language=Language.MALAGASY), - Language.DIVEHI: TranslationLiterals(language=Language.DIVEHI), Language.YIDDISH: TranslationLiterals(language=Language.YIDDISH), - Language.SOMALI: TranslationLiterals(language=Language.SOMALI), - Language.SANSKRIT: TranslationLiterals(language=Language.SANSKRIT), - Language.SINDHI: TranslationLiterals(language=Language.SINDHI), - Language.TURKMEN: TranslationLiterals(language=Language.TURKMEN), - Language.SOUTH_AZERBAIJANI: TranslationLiterals(language=Language.SOUTH_AZERBAIJANI), - Language.SORANI: TranslationLiterals(language=Language.SORANI), - Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO), - Language.WAR: TranslationLiterals(language=Language.WAR), + Language.YORUBA: TranslationLiterals(language=Language.YORUBA), } From 941094ad8d1c74e3b0edeeec0c35f9fa73bf9b1e Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Fri, 11 Oct 2024 00:12:49 +0200 Subject: [PATCH 5/6] nit --- src/lighteval/tasks/templates/nli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/templates/nli.py 
b/src/lighteval/tasks/templates/nli.py index cea3d0f87..842460306 100644 --- a/src/lighteval/tasks/templates/nli.py +++ b/src/lighteval/tasks/templates/nli.py @@ -244,7 +244,7 @@ def prompt_fn(line: dict, task_name: str): choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearanged_labales[:-1]) hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearanged_labales[-1]}{translation_literals.question_mark}" - # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Neither in CF fomulation. However as of right now lighteval doesn't allow to + # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Also in CF formulation. However as of right now lighteval doesn't allow to # use multi-context. row = { "instruction": input_data.get("instruction", ""), From 272bb6b21b355e35abe32c7df8a05630d5f0acdf Mon Sep 17 00:00:00 2001 From: Hynek Kydlicek Date: Fri, 11 Oct 2024 16:47:06 +0200 Subject: [PATCH 6/6] add missing languages to translation literals --- src/lighteval/tasks/templates/utils/translation_literals.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index 4df356111..0b306e1d0 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -132,7 +132,9 @@ def __getattribute__(self, name: str) -> str: Language.BIHARI: TranslationLiterals(language=Language.BIHARI), # Deprecated Language.BOSNIAN: TranslationLiterals(language=Language.BOSNIAN), Language.BRETON: TranslationLiterals(language=Language.BRETON), + Language.BULGARIAN: TranslationLiterals(language=Language.BULGARIAN), Language.BURMESE: TranslationLiterals(language=Language.BURMESE), + Language.CATALAN:
TranslationLiterals(language=Language.CATALAN), Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO), Language.CHINESE: TranslationLiterals( language=Language.CHINESE, @@ -357,6 +359,7 @@ def __getattribute__(self, name: str) -> str: cause_word="poukisa", effect_word="donk sa", ), + Language.HEBREW: TranslationLiterals(language=Language.HEBREW), Language.HINDI: TranslationLiterals( language=Language.HINDI, question_word="सवाल", @@ -488,6 +491,8 @@ def __getattribute__(self, name: str) -> str: Language.KURDISH: TranslationLiterals(language=Language.KURDISH), Language.LAO: TranslationLiterals(language=Language.LAO), Language.LATIN: TranslationLiterals(language=Language.LATIN), + Language.LATVIAN: TranslationLiterals(language=Language.LATVIAN), + Language.LITHUANIAN: TranslationLiterals(language=Language.LITHUANIAN), Language.LUXEMBOURGISH: TranslationLiterals(language=Language.LUXEMBOURGISH), Language.MACEDONIAN: TranslationLiterals(language=Language.MACEDONIAN), Language.MALAGASY: TranslationLiterals(language=Language.MALAGASY), @@ -523,6 +528,7 @@ def __getattribute__(self, name: str) -> str: Language.OCCITAN: TranslationLiterals(language=Language.OCCITAN), Language.ORIYA: TranslationLiterals(language=Language.ORIYA), Language.PASHTO: TranslationLiterals(language=Language.PASHTO), + Language.PERSIAN: TranslationLiterals(language=Language.PERSIAN), Language.POLISH: TranslationLiterals( language=Language.POLISH, question_word="pytanie",