From f3010c17c3c111f2786f37511ec0ebc0941d659e Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:55:53 +0400 Subject: [PATCH 01/11] Update arabic_evals.py Add new Arabic benchmarks and update existing tasks - Renamed `arabic_mmlu` to `arabic_mmlu_mt` to highlight its machine-translated origin. - Added new benchmarks: `arabic_mmlu` ArabicMMLU (https://arxiv.org/abs/2402.12840), `arabic_mmlu_ht` (human-translated), and `MadinahQA` from MBZUAI. As well as `arabic_mmmlu` (OpenAI MMMLU), and `AraTrust` a trustworthiness benchmark for Arabic LLMs (https://arxiv.org/abs/2403.09017). - Enhanced prompt functions for better flexibility in answer options. --- community_tasks/arabic_evals.py | 403 +++++++++++++++++++++++++++++--- 1 file changed, 368 insertions(+), 35 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 323120cd7..1f95396fb 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -39,9 +39,97 @@ LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"] # fmt: on -# ARABIC MMLU ## +# ArabicMMLU # fmt: off ARABIC_MMLU_SUBSETS = [ + "All", "Islamic Studies", "Islamic Studies (Middle School)", "Islamic Studies (Primary School)", "Islamic Studies (High School)", "Driving Test", + "Natural Science (Middle School)", "Natural Science (Primary School)", "History (Middle School)", "History (Primary School)", "History (High School)", "General Knowledge", + "General Knowledge (Middle School)", "General Knowledge (Primary School)", "Law (Professional)", "Physics (High School)", "Social Science (Middle School)", + "Social Science (Primary School)", "Management (University)", "Arabic Language (Middle School)", "Arabic Language (Primary School)", "Arabic Language (High School)", "Political Science (University)", + "Philosophy (High School)", "Accounting (University)", "Computer Science (Middle School)", "Computer Science (Primary School)", "Computer Science (High School)", "Computer Science (University)", + "Geography (Middle School)", "Geography (Primary School)", "Geography (High School)", "Math (Primary School)", "Biology (High School)", "Economics (Middle School)", + "Economics (High School)", "Economics (University)", "Arabic Language (General)", "Arabic Language (Grammar)", "Civics (Middle School)", "Civics (High School)" +] +# fmt: on + + +def arabic_mmlu_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = { + 'A': 'أ', + 'B': 'ب', + 'C': 'ج', + 'D': 'د', + 'E': 'هـ' + } + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + # Construct the query with Arabic letters + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=valid_keys_arabic, # Return only valid choices (Arabic keys) + gold_index=answer_index, # Correct index in the valid Arabic keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form + ) + + +class CustomArabicMMLUTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_pfn, + hf_repo="MBZUAI/ArabicMMLU", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_TASKS = [ + CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +] + + +# ARABIC MMLU HT ## +# fmt: off +ARABIC_MMLU_HT_SUBSETS = [ "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", @@ -54,13 +142,76 @@ # fmt: on -def mmlu_arabic(line, task_name: str = None): - topic = line["subject"] - instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" +def arabic_mmlu_ht_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + choices = line["choices"] + answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] + + query = f"{instruction}{line['question']}\n" + query += "".join([f"{idx}. 
{choice}\n" for idx, choice in enumerate(choices, start=1)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number + ) + + +class CustomArabicMMLUHTTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_ht_pfn, + hf_repo="MBZUAI/human_translated_arabic_mmlu", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_HT_TASKS = [ + CustomArabicMMLUHTTask(name=f"arabic_mmlu_ht:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_HT_SUBSETS +] + +# ARABIC MMLU MT ## +# fmt: off +ARABIC_MMLU_MT_SUBSETS = [ + "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", + "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", + "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", + "high_school_european_history", "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", + "high_school_microeconomics", "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history", + "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", "medical_genetics", + "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting", "professional_law", + "professional_medicine", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions" +] +# fmt: on + + +def arabic_mmlu_mt_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - gold_ix = LETTER_INDICES.index(line["answer"]) + answer_index = LETTER_INDICES.index(line["answer"]) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -70,13 +221,13 @@ def mmlu_arabic(line, task_name: str = None): task_name=task_name, query=query, choices=LETTER_INDICES_AR[:4], - gold_index=gold_ix, + gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix], + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], ) -class CustomArabicMMLUTask(LightevalTaskConfig): +class CustomArabicMMLUMTTask(LightevalTaskConfig): def __init__( self, name, @@ -85,7 +236,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=mmlu_arabic, + prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "dev"], @@ -102,10 +253,49 @@ def __init__( ) -ARABIC_MMLU_TASKS = [ - CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +ARABIC_MMLU_MT_TASKS = [ + CustomArabicMMLUMTTask(name=f"arabic_mmlu_mt:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_MT_SUBSETS ] + +def arabic_mmmlu_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + choices = [line["A"], line["B"], line["C"], line["D"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + answer_index = LETTER_INDICES.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:4], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], + ) + + +# ARABIC MMMLU (OpenAI) ## +arabic_mmmlu_task = LightevalTaskConfig( + name="arabic_mmmlu", + prompt_function=arabic_mmmlu_pfn, + suite=["community"], + hf_repo="openai/MMMLU", + hf_subset="AR_XY", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + metric=[Metrics.loglikelihood_acc_norm], + trust_dataset=True, + version=0, +) + + # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -121,7 +311,7 @@ def __init__( # fmt: on -def acva(line, task_name: str = None): +def acva_pfn(line, task_name: str = None): question = line["question"] answer = line["answer"] @@ -142,7 +332,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=acva, + prompt_function=acva_pfn, hf_repo="OALL/ACVA", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -162,7 +352,67 @@ def __init__( ACVA_TASKS = [CustomACVATask(name=f"acva:{subset}", hf_subset=subset) for subset in ACVA_SUBSETS] -def arabic_exams(line, task_name: str = None): +# AraTrust ## +# fmt: off +ARATRUST_SUBSETS = [ + "Trustfulness", "MentalHealth", "PhysicalHealth", "Offensive", "Ethics", "Privacy", "Unfairness", "Illegal", +] +# fmt: on + + +def aratrust_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. 
\n\n" + choices = [line["A"], line["B"], line["C"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + answer_index = LETTER_INDICES_AR.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{choice}\n" for choice in choices]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:3], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], + ) + + +class CustomAraTrustTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=aratrust_pfn, + hf_repo="asas-ai/AraTrust-categorized", + metric=[Metrics.f1_score], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=[], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARATRUST_TASKS = [ + CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS +] + + +def arabic_exams_pfn(line, task_name: str = None): topic = line["subject"] question = line["question"] choices = [line["A"], line["B"], line["C"], line["D"]] @@ -188,7 +438,7 @@ def arabic_exams(line, task_name: str = None): # ARABIC EXAMS ## arabic_exams_task = LightevalTaskConfig( name="arabic_exams", - prompt_function=arabic_exams, + prompt_function=arabic_exams_pfn, suite=["community"], hf_repo="OALL/Arabic_EXAMS", hf_subset="default", @@ -212,7 +462,7 @@ def arabic_exams(line, task_name: str = None): # fmt: on -def alghafa_prompt(line, task_name: str = None): +def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' @@ -244,7 +494,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -256,6 +506,7 @@ def __init__( stop_sequence=None, output_regex=None, frozen=False, + trust_dataset=True, version=0, ) @@ -266,7 +517,7 @@ def __init__( # race_ar race_ar_task = LightevalTaskConfig( name="race_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", @@ -283,7 +534,7 @@ def __init__( # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="piqa_ar", @@ -300,7 +551,7 @@ def __init__( # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", @@ -317,7 +568,7 @@ def __init__( # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", - prompt_function=alghafa_prompt, 
+ prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", @@ -334,7 +585,7 @@ def __init__( # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", @@ -351,7 +602,7 @@ def __init__( # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", @@ -366,9 +617,7 @@ def __init__( # boolq_ar - - -def boolq_prompt_arabic(line, task_name: str = None): +def boolq_arabic_pfn(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" @@ -393,7 +642,7 @@ def boolq_prompt_arabic(line, task_name: str = None): boolq_ar_task = LightevalTaskConfig( name="boolq_ar", - prompt_function=boolq_prompt_arabic, + prompt_function=boolq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="boolq_ar", @@ -408,7 +657,7 @@ def boolq_prompt_arabic(line, task_name: str = None): # copa_ext_ar -def copa_prompt_arabic(line, task_name: str = None): +def copa_arabic_pfn(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] question_map = {"cause": "لأن", "effect": "لذلك"} @@ -429,7 +678,7 @@ def copa_prompt_arabic(line, task_name: str = None): copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", - prompt_function=copa_prompt_arabic, + prompt_function=copa_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="copa_ext_ar", @@ -444,7 +693,7 @@ def copa_prompt_arabic(line, task_name: str = None): # hellaswag_okapi_ar -def hellaswag_prompt_arabic(line, task_name: str = None): +def hellaswag_arabic_pfn(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) @@ -474,7 +723,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", - prompt_function=hellaswag_prompt_arabic, + prompt_function=hellaswag_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="hellaswag_okapi_ar", @@ -489,7 +738,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): # toxigen_ar -def toxigen_prompt_arabic(line, task_name: str = None): +def toxigen_arabic_pfn(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' 
@@ -512,7 +761,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", - prompt_function=toxigen_prompt_arabic, + prompt_function=toxigen_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="toxigen_ar", @@ -527,7 +776,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): # sciq_ar -def sciq_prompt_arabic(line, task_name: str = None): +def sciq_arabic_pfn(line, task_name: str = None): support = line["support"] question = line["question"] correct_answer = line["correct_answer"] @@ -564,7 +813,7 @@ def sciq_prompt_arabic(line, task_name: str = None): sciq_ar_task = LightevalTaskConfig( name="sciq_ar", - prompt_function=sciq_prompt_arabic, + prompt_function=sciq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="sciq_ar", @@ -578,10 +827,94 @@ def sciq_prompt_arabic(line, task_name: str = None): ) +# madinah_qa +# fmt: off +MADINAH_QA_SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"] +# fmt: on + + +def madinah_qa_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = { + 'A': 'أ', + 'B': 'ب', + 'C': 'ج', + 'D': 'د', + 'E': 'هـ' + } + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer_index, # Correct index in the valid keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form + ) + + +class CustomMadinahQATask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=madinah_qa_pfn, + hf_repo="MBZUAI/MadinahQA", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +MADINAH_QA_TASKS = [ + CustomMadinahQATask(name=f"madinah_qa:{subset}", hf_subset=subset) for subset in MADINAH_QA_SUBSETS +] + + TASKS_TABLE = ( ARABIC_MMLU_TASKS + + ARABIC_MMLU_HT_TASKS + + ARABIC_MMLU_MT_TASKS + + [arabic_mmmlu_task] + ACVA_TASKS + ALGHAFA_TASKS + + ARATRUST_TASKS + + MADINAH_QA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] From 64d4e116d86e796d3a59e32293f60e27e6cc1904 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:57:51 +0400 Subject: [PATCH 02/11] Update and rename OALL_tasks.txt to OALL_v1_tasks.txt Rename file to refelect that it is v1 leaderboard tasks --- examples/tasks/OALL_tasks.txt | 136 ------------------------------- examples/tasks/OALL_v1_tasks.txt | 136 +++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 136 deletions(-) delete mode 100644 examples/tasks/OALL_tasks.txt create mode 100644 examples/tasks/OALL_v1_tasks.txt diff --git a/examples/tasks/OALL_tasks.txt b/examples/tasks/OALL_tasks.txt deleted file mode 100644 index 346d062c6..000000000 --- a/examples/tasks/OALL_tasks.txt +++ /dev/null @@ -1,136 +0,0 @@ -lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 -community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 
-community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 -community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 
-community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 diff --git a/examples/tasks/OALL_v1_tasks.txt b/examples/tasks/OALL_v1_tasks.txt new file mode 100644 index 000000000..08e9a51cd --- /dev/null +++ b/examples/tasks/OALL_v1_tasks.txt @@ -0,0 +1,136 @@ +lighteval|xstory_cloze:ar|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 +community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|arabic_exams|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 
+community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 From f2596d5031261fb6063c6543d037297ea3f3c307 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:59:02 +0400 Subject: [PATCH 03/11] Create OALL_v2_tasks.txt Tasks for v2 of OALL --- examples/tasks/OALL_v2_tasks.txt | 117 +++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 examples/tasks/OALL_v2_tasks.txt diff --git a/examples/tasks/OALL_v2_tasks.txt b/examples/tasks/OALL_v2_tasks.txt new file mode 100644 index 000000000..fc1b4f7e9 --- /dev/null +++ b/examples/tasks/OALL_v2_tasks.txt @@ -0,0 +1,117 @@ +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|arabic_exams|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 +community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 +community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 
+community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 From a1644720edf6af852a4360739b82633e2b3d5f0b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:59:44 +0400 Subject: [PATCH 04/11] Update all_arabic_tasks.txt add new and renamed tasks --- examples/tasks/all_arabic_tasks.txt | 379 ++++++++++++++++++---------- 1 file changed, 243 insertions(+), 136 deletions(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index fa430ed14..8593fa2f8 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -1,137 +1,244 @@ lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 
-community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 -community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 
-community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|mmlu_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 +community|arabic_exams|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 
+community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 +community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|mmlu_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 
+community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 +community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 
+community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 From b6d61dcf2a2628aab7d6ee1b710b20f2f0558593 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:31:52 +0400 Subject: [PATCH 05/11] Update arabic_evals.py Fix formatting issues for --- community_tasks/arabic_evals.py | 74 +++++++++++++++------------------ 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 1f95396fb..f48724a60 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -54,38 +54,32 @@ def arabic_mmlu_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" - + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + # Define the mapping from Latin to Arabic letters - latin_to_arabic = { - 'A': 'أ', - 'B': 'ب', - 'C': 'ج', - 'D': 'د', - 'E': 'هـ' - } - + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + # Create a list of valid choices with corresponding Arabic keys choices = [] valid_keys_latin = [] valid_keys_arabic = [] - + # Enumerate through the options and append the valid ones - for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + for idx, key in enumerate(["A", "B", "C", "D", "E"]): option = line.get(f"Option {idx + 1}") if option: # Check if option is not null choices.append(option) valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter - + # Find the correct index for the answer key in the Arabic version answer_index = valid_keys_latin.index(line["Answer Key"]) - + # Construct the query with Arabic letters query = f"{instruction}{line['Question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) query += "الإجابة:" - + return Doc( task_name=task_name, query=query, @@ -143,9 +137,9 @@ def __init__( def arabic_mmlu_ht_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:\n\n" choices = line["choices"] - answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] + answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] query = f"{instruction}{line['question']}\n" query += "".join([f"{idx}. {choice}\n" for idx, choice in enumerate(choices, start=1)]) @@ -207,11 +201,13 @@ def __init__( def arabic_mmlu_mt_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index(line["answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES.index( + line["answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -259,11 +255,13 @@ def __init__( def arabic_mmmlu_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES.index( + line["Answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['Question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -361,11 +359,13 @@ def __init__( def aratrust_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" choices = [line["A"], line["B"], line["C"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES_AR.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES_AR.index( + line["Answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! 
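# Editor's sketch (not part of this patch): a minimal, self-contained illustration of how
# aratrust_pfn maps a row to a gold index and a query, assuming a hypothetical AraTrust row
# with the schema used above (Question / A / B / C / Answer, where "Answer" is an Arabic letter).
# LETTER_INDICES_AR_DEMO stands in for the first three entries of LETTER_INDICES_AR.
LETTER_INDICES_AR_DEMO = ["أ", "ب", "ج"]
demo_line = {"Question": "سؤال تجريبي؟", "A": "خيار 1", "B": "خيار 2", "C": "خيار 3", "Answer": "ب"}
demo_gold = LETTER_INDICES_AR_DEMO.index(demo_line["Answer"])  # -> 1, i.e. option B
demo_query = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n"
demo_query += demo_line["Question"] + "\n"
demo_query += "".join(f"{choice}\n" for choice in (demo_line["A"], demo_line["B"], demo_line["C"]))
# demo_gold and demo_query mirror the answer_index and query construction shown in this hunk.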
query = f"{instruction}{line['Question']}\n" query += "".join([f"{choice}\n" for choice in choices]) @@ -392,7 +392,9 @@ def __init__( hf_subset=hf_subset, prompt_function=aratrust_pfn, hf_repo="asas-ai/AraTrust-categorized", - metric=[Metrics.f1_score], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] + metric=[ + Metrics.f1_score + ], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split=None, @@ -407,9 +409,7 @@ def __init__( ) -ARATRUST_TASKS = [ - CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS -] +ARATRUST_TASKS = [CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS] def arabic_exams_pfn(line, task_name: str = None): @@ -834,30 +834,24 @@ def sciq_arabic_pfn(line, task_name: str = None): def madinah_qa_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" - + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + # Define the mapping from Latin to Arabic letters - latin_to_arabic = { - 'A': 'أ', - 'B': 'ب', - 'C': 'ج', - 'D': 'د', - 'E': 'هـ' - } - + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + # Create a list of valid choices with corresponding Arabic keys choices = [] valid_keys_latin = [] valid_keys_arabic = [] - + # Enumerate through the options and append the valid ones - for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + for idx, key in enumerate(["A", "B", "C", "D", "E"]): option = line.get(f"Option {idx + 1}") if option: # Check if option is not null choices.append(option) valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter - + # Find the correct index for the answer key in the Arabic version answer_index = valid_keys_latin.index(line["Answer Key"]) From 91aa0e18267917574fab0c501b3b935ddd13f61b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:00:20 +0400 Subject: [PATCH 06/11] Update all_arabic_tasks.txt Add missing task: OpenAI's MMMLU arabic subset --- examples/tasks/all_arabic_tasks.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8593fa2f8..8b62f61b3 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -231,6 +231,7 @@ community|arabic_mmlu:Economics (University)|0|0 community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmmlu|0|0 community|arabic_mmlu:Civics (High School)|0|0 community|madinah_qa:Arabic Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 From 7e163e2974b7eee077379bd8865938a46ff9cce1 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:01:38 +0400 Subject: [PATCH 07/11] Update all_arabic_tasks.txt Correct order --- examples/tasks/all_arabic_tasks.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8b62f61b3..fbb575143 100644 --- 
a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -231,8 +231,8 @@ community|arabic_mmlu:Economics (University)|0|0 community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 -community|arabic_mmmlu|0|0 community|arabic_mmlu:Civics (High School)|0|0 +community|arabic_mmmlu|0|0 community|madinah_qa:Arabic Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 community|aratrust:Trustfulness|0|0 From aa201d25fd62e49a01eacd797e03e261bed51cfd Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:55:26 +0400 Subject: [PATCH 08/11] Update arabic_evals.py remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 --- community_tasks/arabic_evals.py | 41 --------------------------------- 1 file changed, 41 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index f48724a60..ed284d838 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -254,46 +254,6 @@ def __init__( ] -def arabic_mmmlu_pfn(line, task_name: str = None): - instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" - choices = [line["A"], line["B"], line["C"], line["D"]] - # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, - # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index( - line["Answer"] - ) # line["answer"] is the correct answer. That's why we need to index it ! - - query = f"{instruction}{line['Question']}\n" - query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) - query += "الإجابة:" - - return Doc( - task_name=task_name, - query=query, - choices=LETTER_INDICES_AR[:4], - gold_index=answer_index, - instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], - ) - - -# ARABIC MMMLU (OpenAI) ## -arabic_mmmlu_task = LightevalTaskConfig( - name="arabic_mmmlu", - prompt_function=arabic_mmmlu_pfn, - suite=["community"], - hf_repo="openai/MMMLU", - hf_subset="AR_XY", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - metric=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, - version=0, -) - - # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -904,7 +864,6 @@ def __init__( ARABIC_MMLU_TASKS + ARABIC_MMLU_HT_TASKS + ARABIC_MMLU_MT_TASKS - + [arabic_mmmlu_task] + ACVA_TASKS + ALGHAFA_TASKS + ARATRUST_TASKS From 81255aee450a2dd62d6ce15b895769208936f51c Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:56:13 +0400 Subject: [PATCH 09/11] Update all_arabic_tasks.txt remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 --- examples/tasks/all_arabic_tasks.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index fbb575143..8593fa2f8 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -232,7 +232,6 @@ community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 community|arabic_mmlu:Civics (High School)|0|0 -community|arabic_mmmlu|0|0 community|madinah_qa:Arabic 
Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 community|aratrust:Trustfulness|0|0 From b8869cbee3c41e0c7c0667bf5f1623a0a21e3bf0 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 29 Nov 2024 12:50:38 +0400 Subject: [PATCH 10/11] Update tasks.py Adding a templated version of arabic mmlu based on @hynky1999 request in the #372 PR --- src/lighteval/tasks/multilingual/tasks.py | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index c788871ba..e41cc4615 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -60,6 +60,8 @@ from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro +# Import for "arabic_mmlu_templated_tasks" +from lighteval.tasks.requests import Doc TASKS_TABLE = [] # ------------------------------- NLI Tasks ------------------------------- # @@ -2031,6 +2033,42 @@ ] ] + +# definition of templated version of arabic_mmlu +arabic_mmlu_templated_tasks = [ + LightevalTaskConfig( + name=f"templated_mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", + prompt_function=get_mcq_prompt_function( + Language.ARABIC, + lambda line: { + "instruction": "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:", + "context": line["Context"], + "question": line["Question"], + "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], + "gold_idx": LETTER_INDICES.index(line["Answer Key"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="MBZUAI/ArabicMMLU", + hf_subset=subset, + evaluation_splits=("test",), + hf_avail_splits=["dev"], + metric=get_metrics_for_formulation( + formulation, + [ + loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + loglikelihood_acc_metric(normalization=LogProbCharNorm()), + ], + ), + ) + for subset in ARABIC_MMLU_SUBSETS + for formulation in [ + MCFFormulation("NativeLetters"), + ] +] + + TURKISH_MMLU_SUBSET = [ "Biology", "Chemistry", From bdb2867d9ccd2758c3814f7c09cd2a1b40cf835d Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:53:38 +0400 Subject: [PATCH 11/11] Update tasks.py remove arabic_mmlu_templated_tasks --- src/lighteval/tasks/multilingual/tasks.py | 37 ----------------------- 1 file changed, 37 deletions(-) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index e41cc4615..899d6d0c7 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -60,8 +60,6 @@ from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro -# Import for "arabic_mmlu_templated_tasks" -from lighteval.tasks.requests import Doc TASKS_TABLE = [] # ------------------------------- NLI Tasks ------------------------------- # @@ -2034,41 +2032,6 @@ ] -# definition of templated version of arabic_mmlu -arabic_mmlu_templated_tasks = [ - LightevalTaskConfig( - name=f"templated_mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", - prompt_function=get_mcq_prompt_function( - Language.ARABIC, - lambda line: { - "instruction": "السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:", - "context": line["Context"], - "question": line["Question"], - "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], - "gold_idx": LETTER_INDICES.index(line["Answer Key"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="MBZUAI/ArabicMMLU", - hf_subset=subset, - evaluation_splits=("test",), - hf_avail_splits=["dev"], - metric=get_metrics_for_formulation( - formulation, - [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in ARABIC_MMLU_SUBSETS - for formulation in [ - MCFFormulation("NativeLetters"), - ] -] - - TURKISH_MMLU_SUBSET = [ "Biology", "Chemistry",