From f3010c17c3c111f2786f37511ec0ebc0941d659e Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:55:53 +0400 Subject: [PATCH 01/11] Update arabic_evals.py Add new Arabic benchmarks and update existing tasks - Renamed `arabic_mmlu` to `arabic_mmlu_mt` to highlight its machine-translated origin. - Added new benchmarks: `arabic_mmlu` ArabicMMLU (https://arxiv.org/abs/2402.12840), `arabic_mmlu_ht` (human-translated), and `MadinahQA` from MBZUAI. As well as `arabic_mmmlu` (OpenAI MMMLU), and `AraTrust` a trustworthiness benchmark for Arabic LLMs (https://arxiv.org/abs/2403.09017). - Enhanced prompt functions for better flexibility in answer options. --- community_tasks/arabic_evals.py | 403 +++++++++++++++++++++++++++++--- 1 file changed, 368 insertions(+), 35 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 323120cd7..1f95396fb 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -39,9 +39,97 @@ LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"] # fmt: on -# ARABIC MMLU ## +# ArabicMMLU # fmt: off ARABIC_MMLU_SUBSETS = [ + "All", "Islamic Studies", "Islamic Studies (Middle School)", "Islamic Studies (Primary School)", "Islamic Studies (High School)", "Driving Test", + "Natural Science (Middle School)", "Natural Science (Primary School)", "History (Middle School)", "History (Primary School)", "History (High School)", "General Knowledge", + "General Knowledge (Middle School)", "General Knowledge (Primary School)", "Law (Professional)", "Physics (High School)", "Social Science (Middle School)", + "Social Science (Primary School)", "Management (University)", "Arabic Language (Middle School)", "Arabic Language (Primary School)", "Arabic Language (High School)", "Political Science (University)", + "Philosophy (High School)", "Accounting (University)", "Computer Science (Middle School)", "Computer Science (Primary School)", "Computer Science (High School)", "Computer Science (University)", + "Geography (Middle School)", "Geography (Primary School)", "Geography (High School)", "Math (Primary School)", "Biology (High School)", "Economics (Middle School)", + "Economics (High School)", "Economics (University)", "Arabic Language (General)", "Arabic Language (Grammar)", "Civics (Middle School)", "Civics (High School)" +] +# fmt: on + + +def arabic_mmlu_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = { + 'A': 'أ', + 'B': 'ب', + 'C': 'ج', + 'D': 'د', + 'E': 'هـ' + } + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + # Construct the query with Arabic letters + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=valid_keys_arabic, # Return only valid choices (Arabic keys) + gold_index=answer_index, # Correct index in the valid Arabic keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form + ) + + +class CustomArabicMMLUTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_pfn, + hf_repo="MBZUAI/ArabicMMLU", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_TASKS = [ + CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +] + + +# ARABIC MMLU HT ## +# fmt: off +ARABIC_MMLU_HT_SUBSETS = [ "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", @@ -54,13 +142,76 @@ # fmt: on -def mmlu_arabic(line, task_name: str = None): - topic = line["subject"] - instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" +def arabic_mmlu_ht_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + choices = line["choices"] + answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] + + query = f"{instruction}{line['question']}\n" + query += "".join([f"{idx}. 
{choice}\n" for idx, choice in enumerate(choices, start=1)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number + ) + + +class CustomArabicMMLUHTTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_ht_pfn, + hf_repo="MBZUAI/human_translated_arabic_mmlu", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_HT_TASKS = [ + CustomArabicMMLUHTTask(name=f"arabic_mmlu_ht:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_HT_SUBSETS +] + +# ARABIC MMLU MT ## +# fmt: off +ARABIC_MMLU_MT_SUBSETS = [ + "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", + "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", + "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", + "high_school_european_history", "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", + "high_school_microeconomics", "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history", + "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", "medical_genetics", + "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting", "professional_law", + "professional_medicine", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions" +] +# fmt: on + + +def arabic_mmlu_mt_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - gold_ix = LETTER_INDICES.index(line["answer"]) + answer_index = LETTER_INDICES.index(line["answer"]) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -70,13 +221,13 @@ def mmlu_arabic(line, task_name: str = None): task_name=task_name, query=query, choices=LETTER_INDICES_AR[:4], - gold_index=gold_ix, + gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix], + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], ) -class CustomArabicMMLUTask(LightevalTaskConfig): +class CustomArabicMMLUMTTask(LightevalTaskConfig): def __init__( self, name, @@ -85,7 +236,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=mmlu_arabic, + prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "dev"], @@ -102,10 +253,49 @@ def __init__( ) -ARABIC_MMLU_TASKS = [ - CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +ARABIC_MMLU_MT_TASKS = [ + CustomArabicMMLUMTTask(name=f"arabic_mmlu_mt:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_MT_SUBSETS ] + +def arabic_mmmlu_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + choices = [line["A"], line["B"], line["C"], line["D"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + answer_index = LETTER_INDICES.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:4], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], + ) + + +# ARABIC MMMLU (OpenAI) ## +arabic_mmmlu_task = LightevalTaskConfig( + name="arabic_mmmlu", + prompt_function=arabic_mmmlu_pfn, + suite=["community"], + hf_repo="openai/MMMLU", + hf_subset="AR_XY", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + metric=[Metrics.loglikelihood_acc_norm], + trust_dataset=True, + version=0, +) + + # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -121,7 +311,7 @@ def __init__( # fmt: on -def acva(line, task_name: str = None): +def acva_pfn(line, task_name: str = None): question = line["question"] answer = line["answer"] @@ -142,7 +332,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=acva, + prompt_function=acva_pfn, hf_repo="OALL/ACVA", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -162,7 +352,67 @@ def __init__( ACVA_TASKS = [CustomACVATask(name=f"acva:{subset}", hf_subset=subset) for subset in ACVA_SUBSETS] -def arabic_exams(line, task_name: str = None): +# AraTrust ## +# fmt: off +ARATRUST_SUBSETS = [ + "Trustfulness", "MentalHealth", "PhysicalHealth", "Offensive", "Ethics", "Privacy", "Unfairness", "Illegal", +] +# fmt: on + + +def aratrust_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. 
\n\n" + choices = [line["A"], line["B"], line["C"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + answer_index = LETTER_INDICES_AR.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{choice}\n" for choice in choices]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:3], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], + ) + + +class CustomAraTrustTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=aratrust_pfn, + hf_repo="asas-ai/AraTrust-categorized", + metric=[Metrics.f1_score], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=[], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARATRUST_TASKS = [ + CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS +] + + +def arabic_exams_pfn(line, task_name: str = None): topic = line["subject"] question = line["question"] choices = [line["A"], line["B"], line["C"], line["D"]] @@ -188,7 +438,7 @@ def arabic_exams(line, task_name: str = None): # ARABIC EXAMS ## arabic_exams_task = LightevalTaskConfig( name="arabic_exams", - prompt_function=arabic_exams, + prompt_function=arabic_exams_pfn, suite=["community"], hf_repo="OALL/Arabic_EXAMS", hf_subset="default", @@ -212,7 +462,7 @@ def arabic_exams(line, task_name: str = None): # fmt: on -def alghafa_prompt(line, task_name: str = None): +def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' @@ -244,7 +494,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -256,6 +506,7 @@ def __init__( stop_sequence=None, output_regex=None, frozen=False, + trust_dataset=True, version=0, ) @@ -266,7 +517,7 @@ def __init__( # race_ar race_ar_task = LightevalTaskConfig( name="race_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", @@ -283,7 +534,7 @@ def __init__( # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="piqa_ar", @@ -300,7 +551,7 @@ def __init__( # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", @@ -317,7 +568,7 @@ def __init__( # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", - prompt_function=alghafa_prompt, 
+ prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", @@ -334,7 +585,7 @@ def __init__( # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", @@ -351,7 +602,7 @@ def __init__( # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", @@ -366,9 +617,7 @@ def __init__( # boolq_ar - - -def boolq_prompt_arabic(line, task_name: str = None): +def boolq_arabic_pfn(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" @@ -393,7 +642,7 @@ def boolq_prompt_arabic(line, task_name: str = None): boolq_ar_task = LightevalTaskConfig( name="boolq_ar", - prompt_function=boolq_prompt_arabic, + prompt_function=boolq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="boolq_ar", @@ -408,7 +657,7 @@ def boolq_prompt_arabic(line, task_name: str = None): # copa_ext_ar -def copa_prompt_arabic(line, task_name: str = None): +def copa_arabic_pfn(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] question_map = {"cause": "لأن", "effect": "لذلك"} @@ -429,7 +678,7 @@ def copa_prompt_arabic(line, task_name: str = None): copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", - prompt_function=copa_prompt_arabic, + prompt_function=copa_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="copa_ext_ar", @@ -444,7 +693,7 @@ def copa_prompt_arabic(line, task_name: str = None): # hellaswag_okapi_ar -def hellaswag_prompt_arabic(line, task_name: str = None): +def hellaswag_arabic_pfn(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) @@ -474,7 +723,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", - prompt_function=hellaswag_prompt_arabic, + prompt_function=hellaswag_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="hellaswag_okapi_ar", @@ -489,7 +738,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): # toxigen_ar -def toxigen_prompt_arabic(line, task_name: str = None): +def toxigen_arabic_pfn(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' 
@@ -512,7 +761,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", - prompt_function=toxigen_prompt_arabic, + prompt_function=toxigen_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="toxigen_ar", @@ -527,7 +776,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): # sciq_ar -def sciq_prompt_arabic(line, task_name: str = None): +def sciq_arabic_pfn(line, task_name: str = None): support = line["support"] question = line["question"] correct_answer = line["correct_answer"] @@ -564,7 +813,7 @@ def sciq_prompt_arabic(line, task_name: str = None): sciq_ar_task = LightevalTaskConfig( name="sciq_ar", - prompt_function=sciq_prompt_arabic, + prompt_function=sciq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="sciq_ar", @@ -578,10 +827,94 @@ def sciq_prompt_arabic(line, task_name: str = None): ) +# madinah_qa +# fmt: off +MADINAH_QA_SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"] +# fmt: on + + +def madinah_qa_pfn(line, task_name: str = None): + instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = { + 'A': 'أ', + 'B': 'ب', + 'C': 'ج', + 'D': 'د', + 'E': 'هـ' + } + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer_index, # Correct index in the valid keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form + ) + + +class CustomMadinahQATask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=madinah_qa_pfn, + hf_repo="MBZUAI/MadinahQA", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +MADINAH_QA_TASKS = [ + CustomMadinahQATask(name=f"madinah_qa:{subset}", hf_subset=subset) for subset in MADINAH_QA_SUBSETS +] + + TASKS_TABLE = ( ARABIC_MMLU_TASKS + + ARABIC_MMLU_HT_TASKS + + ARABIC_MMLU_MT_TASKS + + [arabic_mmmlu_task] + ACVA_TASKS + ALGHAFA_TASKS + + ARATRUST_TASKS + + MADINAH_QA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] From 64d4e116d86e796d3a59e32293f60e27e6cc1904 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:57:51 +0400 Subject: [PATCH 02/11] Update and rename OALL_tasks.txt to OALL_v1_tasks.txt Rename file to refelect that it is v1 leaderboard tasks --- examples/tasks/OALL_tasks.txt | 136 ------------------------------- examples/tasks/OALL_v1_tasks.txt | 136 +++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 136 deletions(-) delete mode 100644 examples/tasks/OALL_tasks.txt create mode 100644 examples/tasks/OALL_v1_tasks.txt diff --git a/examples/tasks/OALL_tasks.txt b/examples/tasks/OALL_tasks.txt deleted file mode 100644 index 346d062c6..000000000 --- a/examples/tasks/OALL_tasks.txt +++ /dev/null @@ -1,136 +0,0 @@ -lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 -community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 
-community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 -community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 
-community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 diff --git a/examples/tasks/OALL_v1_tasks.txt b/examples/tasks/OALL_v1_tasks.txt new file mode 100644 index 000000000..08e9a51cd --- /dev/null +++ b/examples/tasks/OALL_v1_tasks.txt @@ -0,0 +1,136 @@ +lighteval|xstory_cloze:ar|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 +community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|arabic_exams|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 
+community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 From f2596d5031261fb6063c6543d037297ea3f3c307 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:59:02 +0400 Subject: [PATCH 03/11] Create OALL_v2_tasks.txt Tasks for v2 of OALL --- examples/tasks/OALL_v2_tasks.txt | 117 +++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 examples/tasks/OALL_v2_tasks.txt diff --git a/examples/tasks/OALL_v2_tasks.txt b/examples/tasks/OALL_v2_tasks.txt new file mode 100644 index 000000000..fc1b4f7e9 --- /dev/null +++ b/examples/tasks/OALL_v2_tasks.txt @@ -0,0 +1,117 @@ +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|arabic_exams|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 +community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 +community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 
+community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 From a1644720edf6af852a4360739b82633e2b3d5f0b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:59:44 +0400 Subject: [PATCH 04/11] Update all_arabic_tasks.txt add new and renamed tasks --- examples/tasks/all_arabic_tasks.txt | 379 ++++++++++++++++++---------- 1 file changed, 243 insertions(+), 136 deletions(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index fa430ed14..8593fa2f8 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -1,137 +1,244 @@ lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 
-community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 -community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 
-community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|mmlu_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 +community|arabic_exams|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 
+community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 +community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|mmlu_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 
+community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 +community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 
+community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 From b6d61dcf2a2628aab7d6ee1b710b20f2f0558593 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:31:52 +0400 Subject: [PATCH 05/11] Update arabic_evals.py Fix formatting issues for --- community_tasks/arabic_evals.py | 74 +++++++++++++++------------------ 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 1f95396fb..f48724a60 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -54,38 +54,32 @@ def arabic_mmlu_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" - + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + # Define the mapping from Latin to Arabic letters - latin_to_arabic = { - 'A': 'أ', - 'B': 'ب', - 'C': 'ج', - 'D': 'د', - 'E': 'هـ' - } - + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + # Create a list of valid choices with corresponding Arabic keys choices = [] valid_keys_latin = [] valid_keys_arabic = [] - + # Enumerate through the options and append the valid ones - for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + for idx, key in enumerate(["A", "B", "C", "D", "E"]): option = line.get(f"Option {idx + 1}") if option: # Check if option is not null choices.append(option) valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter - + # Find the correct index for the answer key in the Arabic version answer_index = valid_keys_latin.index(line["Answer Key"]) - + # Construct the query with Arabic letters query = f"{instruction}{line['Question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) query += "الإجابة:" - + return Doc( task_name=task_name, query=query, @@ -143,9 +137,9 @@ def __init__( def arabic_mmlu_ht_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:\n\n" choices = line["choices"] - answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] + answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] query = f"{instruction}{line['question']}\n" query += "".join([f"{idx}. {choice}\n" for idx, choice in enumerate(choices, start=1)]) @@ -207,11 +201,13 @@ def __init__( def arabic_mmlu_mt_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index(line["answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES.index( + line["answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -259,11 +255,13 @@ def __init__( def arabic_mmmlu_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES.index( + line["Answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['Question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -361,11 +359,13 @@ def __init__( def aratrust_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" choices = [line["A"], line["B"], line["C"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - answer_index = LETTER_INDICES_AR.index(line["Answer"]) # line["answer"] is the correct answer. That's why we need to index it ! + answer_index = LETTER_INDICES_AR.index( + line["Answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! 
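# Editor's sketch (not part of this patch): a minimal, self-contained illustration of how
# aratrust_pfn maps a row to a gold index and a query, assuming a hypothetical AraTrust row
# with the schema used above (Question / A / B / C / Answer, where "Answer" is an Arabic letter).
# LETTER_INDICES_AR_DEMO stands in for the first three entries of LETTER_INDICES_AR.
LETTER_INDICES_AR_DEMO = ["أ", "ب", "ج"]
demo_line = {"Question": "سؤال تجريبي؟", "A": "خيار 1", "B": "خيار 2", "C": "خيار 3", "Answer": "ب"}
demo_gold = LETTER_INDICES_AR_DEMO.index(demo_line["Answer"])  # -> 1, i.e. option B
demo_query = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n"
demo_query += demo_line["Question"] + "\n"
demo_query += "".join(f"{choice}\n" for choice in (demo_line["A"], demo_line["B"], demo_line["C"]))
# demo_gold and demo_query mirror the answer_index and query construction shown in this hunk.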
query = f"{instruction}{line['Question']}\n" query += "".join([f"{choice}\n" for choice in choices]) @@ -392,7 +392,9 @@ def __init__( hf_subset=hf_subset, prompt_function=aratrust_pfn, hf_repo="asas-ai/AraTrust-categorized", - metric=[Metrics.f1_score], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] + metric=[ + Metrics.f1_score + ], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split=None, @@ -407,9 +409,7 @@ def __init__( ) -ARATRUST_TASKS = [ - CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS -] +ARATRUST_TASKS = [CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS] def arabic_exams_pfn(line, task_name: str = None): @@ -834,30 +834,24 @@ def sciq_arabic_pfn(line, task_name: str = None): def madinah_qa_pfn(line, task_name: str = None): - instruction = f"السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" - + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + # Define the mapping from Latin to Arabic letters - latin_to_arabic = { - 'A': 'أ', - 'B': 'ب', - 'C': 'ج', - 'D': 'د', - 'E': 'هـ' - } - + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + # Create a list of valid choices with corresponding Arabic keys choices = [] valid_keys_latin = [] valid_keys_arabic = [] - + # Enumerate through the options and append the valid ones - for idx, key in enumerate(['A', 'B', 'C', 'D', 'E']): + for idx, key in enumerate(["A", "B", "C", "D", "E"]): option = line.get(f"Option {idx + 1}") if option: # Check if option is not null choices.append(option) valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter - + # Find the correct index for the answer key in the Arabic version answer_index = valid_keys_latin.index(line["Answer Key"]) From 91aa0e18267917574fab0c501b3b935ddd13f61b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:00:20 +0400 Subject: [PATCH 06/11] Update all_arabic_tasks.txt Add missing task: OpenAI's MMMLU arabic subset --- examples/tasks/all_arabic_tasks.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8593fa2f8..8b62f61b3 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -231,6 +231,7 @@ community|arabic_mmlu:Economics (University)|0|0 community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmmlu|0|0 community|arabic_mmlu:Civics (High School)|0|0 community|madinah_qa:Arabic Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 From 7e163e2974b7eee077379bd8865938a46ff9cce1 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:01:38 +0400 Subject: [PATCH 07/11] Update all_arabic_tasks.txt Correct order --- examples/tasks/all_arabic_tasks.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8b62f61b3..fbb575143 100644 --- 
a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -231,8 +231,8 @@ community|arabic_mmlu:Economics (University)|0|0 community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 -community|arabic_mmmlu|0|0 community|arabic_mmlu:Civics (High School)|0|0 +community|arabic_mmmlu|0|0 community|madinah_qa:Arabic Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 community|aratrust:Trustfulness|0|0 From aa201d25fd62e49a01eacd797e03e261bed51cfd Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:55:26 +0400 Subject: [PATCH 08/11] Update arabic_evals.py remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 --- community_tasks/arabic_evals.py | 41 --------------------------------- 1 file changed, 41 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index f48724a60..ed284d838 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -254,46 +254,6 @@ def __init__( ] -def arabic_mmmlu_pfn(line, task_name: str = None): - instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" - choices = [line["A"], line["B"], line["C"], line["D"]] - # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, - # it will then be applied to arabic letters - answer_index = LETTER_INDICES.index( - line["Answer"] - ) # line["answer"] is the correct answer. That's why we need to index it ! - - query = f"{instruction}{line['Question']}\n" - query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) - query += "الإجابة:" - - return Doc( - task_name=task_name, - query=query, - choices=LETTER_INDICES_AR[:4], - gold_index=answer_index, - instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], - ) - - -# ARABIC MMMLU (OpenAI) ## -arabic_mmmlu_task = LightevalTaskConfig( - name="arabic_mmmlu", - prompt_function=arabic_mmmlu_pfn, - suite=["community"], - hf_repo="openai/MMMLU", - hf_subset="AR_XY", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - metric=[Metrics.loglikelihood_acc_norm], - trust_dataset=True, - version=0, -) - - # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -904,7 +864,6 @@ def __init__( ARABIC_MMLU_TASKS + ARABIC_MMLU_HT_TASKS + ARABIC_MMLU_MT_TASKS - + [arabic_mmmlu_task] + ACVA_TASKS + ALGHAFA_TASKS + ARATRUST_TASKS From 81255aee450a2dd62d6ce15b895769208936f51c Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:56:13 +0400 Subject: [PATCH 09/11] Update all_arabic_tasks.txt remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 --- examples/tasks/all_arabic_tasks.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index fbb575143..8593fa2f8 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -232,7 +232,6 @@ community|arabic_mmlu:Arabic Language (General)|0|0 community|arabic_mmlu:Arabic Language (Grammar)|0|0 community|arabic_mmlu:Civics (Middle School)|0|0 community|arabic_mmlu:Civics (High School)|0|0 -community|arabic_mmmlu|0|0 community|madinah_qa:Arabic 
Language (General)|0|0 community|madinah_qa:Arabic Language (Grammar)|0|0 community|aratrust:Trustfulness|0|0 From b8869cbee3c41e0c7c0667bf5f1623a0a21e3bf0 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 29 Nov 2024 12:50:38 +0400 Subject: [PATCH 10/11] Update tasks.py Adding a templated version of arabic mmlu based on @hynky1999 request in the #372 PR --- src/lighteval/tasks/multilingual/tasks.py | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index c788871ba..e41cc4615 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -60,6 +60,8 @@ from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro +# Import for "arabic_mmlu_templated_tasks" +from lighteval.tasks.requests import Doc TASKS_TABLE = [] # ------------------------------- NLI Tasks ------------------------------- # @@ -2031,6 +2033,42 @@ ] ] + +# definition of templated version of arabic_mmlu +arabic_mmlu_templated_tasks = [ + LightevalTaskConfig( + name=f"templated_mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", + prompt_function=get_mcq_prompt_function( + Language.ARABIC, + lambda line: { + "instruction": "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:", + "context": line["Context"], + "question": line["Question"], + "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], + "gold_idx": LETTER_INDICES.index(line["Answer Key"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="MBZUAI/ArabicMMLU", + hf_subset=subset, + evaluation_splits=("test",), + hf_avail_splits=["dev"], + metric=get_metrics_for_formulation( + formulation, + [ + loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + loglikelihood_acc_metric(normalization=LogProbCharNorm()), + ], + ), + ) + for subset in ARABIC_MMLU_SUBSETS + for formulation in [ + MCFFormulation("NativeLetters"), + ] +] + + TURKISH_MMLU_SUBSET = [ "Biology", "Chemistry", From bdb2867d9ccd2758c3814f7c09cd2a1b40cf835d Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:53:38 +0400 Subject: [PATCH 11/11] Update tasks.py remove arabic_mmlu_templated_tasks --- src/lighteval/tasks/multilingual/tasks.py | 37 ----------------------- 1 file changed, 37 deletions(-) diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index e41cc4615..899d6d0c7 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -60,8 +60,6 @@ from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro -# Import for "arabic_mmlu_templated_tasks" -from lighteval.tasks.requests import Doc TASKS_TABLE = [] # ------------------------------- NLI Tasks ------------------------------- # @@ -2034,41 +2032,6 @@ ] -# definition of templated version of arabic_mmlu -arabic_mmlu_templated_tasks = [ - LightevalTaskConfig( - name=f"templated_mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", - prompt_function=get_mcq_prompt_function( - Language.ARABIC, - lambda line: { - "instruction": "السؤال التالي هو سؤال متعدد الإختيارات. 
اختر الإجابة الصحيحة:", - "context": line["Context"], - "question": line["Question"], - "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], - "gold_idx": LETTER_INDICES.index(line["Answer Key"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="MBZUAI/ArabicMMLU", - hf_subset=subset, - evaluation_splits=("test",), - hf_avail_splits=["dev"], - metric=get_metrics_for_formulation( - formulation, - [ - loglikelihood_acc_metric(normalization=LogProbTokenNorm()), - loglikelihood_acc_metric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in ARABIC_MMLU_SUBSETS - for formulation in [ - MCFFormulation("NativeLetters"), - ] -] - - TURKISH_MMLU_SUBSET = [ "Biology", "Chemistry",