From 859c5f0ddf11dc6c9aa3065061d06823df35fffc Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 5 Mar 2024 17:32:34 +0100
Subject: [PATCH 01/27] Update arabic_evals.py

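Add support for the AlGhafa benchmarking suite.

Also switch three existing task configs from `loglikelihood_acc` to
`loglikelihood_acc_norm`, keeping the old metric as a commented-out line.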
---
 community_tasks/arabic_evals.py | 72 +++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 4 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 07ef6b327..4e5aefe7b 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -61,7 +61,8 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function="mmlu_arabic",
             hf_repo="OALL/Arabic_MMLU",
-            metric=["loglikelihood_acc"],
+            metric=["loglikelihood_acc_norm"],
+            # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",
@@ -128,7 +129,8 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function="acva",
             hf_repo="OALL/ACVA",
-            metric=["loglikelihood_acc"],
+            metric=["loglikelihood_acc_norm"],
+            # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -168,7 +170,8 @@ def acva(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=["loglikelihood_acc"],
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -196,7 +199,68 @@ def arabic_exams(line, task_name: str = None):
     )
 
 
-_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + [arabic_exams_task]
+## ALGHAFA ##
+# fmt: off
+ALGHAFA_SUBSETS = [
+    "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_copa_translated_task", "multiple_choice_facts_truefalse_balanced_task",
+    "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_openbookqa_translated_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task",
+    "multiple_choice_sentiment_task"
+]
+# fmt: on
+
+
+class CustomALGHAFATask(LightevalTaskConfig):
+    def __init__(
+        self,
+        name,
+        hf_subset,
+    ):
+        super().__init__(
+            name=name,
+            hf_subset=hf_subset,
+            prompt_function="Alghafa",
+            hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark",
+            metric=["loglikelihood_acc_norm"],
+            # metric=["loglikelihood_acc"],
+            hf_avail_splits=["test", "validation"],
+            evaluation_splits=["test"],
+            few_shots_split="validation",
+            few_shots_select="sequential",
+            suite=["community"],
+            generation_size=-1,
+            stop_sequence=None,
+            output_regex=None,
+            frozen=False,
+        )
+
+
+ALGHAFA_TASKS = [CustomALGHAFATask(name=f"Alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
+
+
+def Alghafa(line, task_name: str = None):
+    question = line["query"]
+    answer_index = int(line["label"])
+    # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label'
+    choices_keys = [key for key in line.keys() if key not in ["query", "label", "__few_shots"]]
+    choices = [line[key] for key in choices_keys]
+
+    instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
+    query = f"{instruction}السؤال: {question}\n"
+    for index, choice in enumerate(choices):
+        query += f"{index}) {choice}\n"
+    query += "الإجابة:"
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        gold_index=answer_index,
+        instruction=instruction,
+        target_for_fewshot_sorting=choices[answer_index],
+    )
+
+
+_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task]
 
 # Convert to dict for lighteval
 TASKS_TABLE = [task.as_dict() for task in _TASKS]

From 6249e1f5b9df9df14cf34b90a4374cfaafaeb904 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 5 Mar 2024 17:37:12 +0100
Subject: [PATCH 02/27] Update OALL_tasks.txt

Add the AlGhafa benchmark subsets to the OALL task list.
---
 tasks_examples/OALL_tasks.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt
index 5428fba49..f6ef495ea 100644
--- a/tasks_examples/OALL_tasks.txt
+++ b/tasks_examples/OALL_tasks.txt
@@ -114,4 +114,15 @@ community|acva:communication|5|1
 community|acva:computer_and_phone|5|1
 community|acva:daily_life|5|1
 community|acva:entertainment|5|1
+community|Alghafa:mcq_exams_test_ar|5|1
+community|Alghafa:meta_ar_dialects|5|1
+community|Alghafa:meta_ar_msa|5|1
+community|Alghafa:multiple_choice_copa_translated_task|5|1
+community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
+community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
+community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
+community|Alghafa:multiple_choice_openbookqa_translated_task|5|1
+community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
+community|Alghafa:multiple_choice_rating_sentiment_task|5|1
+community|Alghafa:multiple_choice_sentiment_task|5|1
 lighteval|xstory_cloze:ar|0|0

From d30d1ed19ebb566e09cfad28f6ff2b7d81714453 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Wed, 6 Mar 2024 22:10:53 +0100
Subject: [PATCH 03/27] Update arabic_evals.py

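Remove the translated subsets (COPA, OpenBookQA) from the AlGhafa subset
list and point the task at the `OALL/AlGhafa-Arabic-LLM-Benchmark-Native` repo.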
---
 community_tasks/arabic_evals.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 4e5aefe7b..2fc1a2807 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -202,9 +202,9 @@ def arabic_exams(line, task_name: str = None):
 ## ALGHAFA ##
 # fmt: off
 ALGHAFA_SUBSETS = [
-    "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_copa_translated_task", "multiple_choice_facts_truefalse_balanced_task",
-    "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_openbookqa_translated_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task",
-    "multiple_choice_sentiment_task"
+    "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", 
+    "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", 
+    "multiple_choice_sentiment_task", # "multiple_choice_openbookqa_translated_task", "multiple_choice_copa_translated_task" ### TODO : clean up this later !
 ]
 # fmt: on
 
@@ -219,7 +219,7 @@ def __init__(
             name=name,
             hf_subset=hf_subset,
             prompt_function="Alghafa",
-            hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark",
+            hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
             metric=["loglikelihood_acc_norm"],
             # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "validation"],

From 7f1e657a0c47ac0201de52b71200b00856a3c73e Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Thu, 7 Mar 2024 23:31:46 +0100
Subject: [PATCH 04/27] Create all_arabic_tasks.txt

This new file lists all the Arabic tasks, including tasks not present in OALL_tasks.txt.
---
 tasks_examples/all_arabic_tasks.txt | 139 ++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 tasks_examples/all_arabic_tasks.txt

diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt
new file mode 100644
index 000000000..578f934a5
--- /dev/null
+++ b/tasks_examples/all_arabic_tasks.txt
@@ -0,0 +1,139 @@
+lighteval|xstory_cloze:ar|0|0
+community|arabic_mmlu:abstract_algebra|5|1
+community|arabic_mmlu:anatomy|5|1
+community|arabic_mmlu:astronomy|5|1
+community|arabic_mmlu:business_ethics|5|1
+community|arabic_mmlu:clinical_knowledge|5|1
+community|arabic_mmlu:college_biology|5|1
+community|arabic_mmlu:college_chemistry|5|1
+community|arabic_mmlu:college_computer_science|5|1
+community|arabic_mmlu:college_mathematics|5|1
+community|arabic_mmlu:college_medicine|5|1
+community|arabic_mmlu:college_physics|5|1
+community|arabic_mmlu:computer_security|5|1
+community|arabic_mmlu:conceptual_physics|5|1
+community|arabic_mmlu:econometrics|5|1
+community|arabic_mmlu:electrical_engineering|5|1
+community|arabic_mmlu:elementary_mathematics|5|1
+community|arabic_mmlu:formal_logic|5|1
+community|arabic_mmlu:global_facts|5|1
+community|arabic_mmlu:high_school_biology|5|1
+community|arabic_mmlu:high_school_chemistry|5|1
+community|arabic_mmlu:high_school_computer_science|5|1
+community|arabic_mmlu:high_school_european_history|5|1
+community|arabic_mmlu:high_school_geography|5|1
+community|arabic_mmlu:high_school_government_and_politics|5|1
+community|arabic_mmlu:high_school_macroeconomics|5|1
+community|arabic_mmlu:high_school_mathematics|5|1
+community|arabic_mmlu:high_school_microeconomics|5|1
+community|arabic_mmlu:high_school_physics|5|1
+community|arabic_mmlu:high_school_psychology|5|1
+community|arabic_mmlu:high_school_statistics|5|1
+community|arabic_mmlu:high_school_us_history|5|1
+community|arabic_mmlu:high_school_world_history|5|1
+community|arabic_mmlu:human_aging|5|1
+community|arabic_mmlu:human_sexuality|5|1
+community|arabic_mmlu:international_law|5|1
+community|arabic_mmlu:jurisprudence|5|1
+community|arabic_mmlu:logical_fallacies|5|1
+community|arabic_mmlu:machine_learning|5|1
+community|arabic_mmlu:management|5|1
+community|arabic_mmlu:marketing|5|1
+community|arabic_mmlu:medical_genetics|5|1
+community|arabic_mmlu:miscellaneous|5|1
+community|arabic_mmlu:moral_disputes|5|1
+community|arabic_mmlu:moral_scenarios|5|1
+community|arabic_mmlu:nutrition|5|1
+community|arabic_mmlu:philosophy|5|1
+community|arabic_mmlu:prehistory|5|1
+community|arabic_mmlu:professional_accounting|5|1
+community|arabic_mmlu:professional_law|5|1
+community|arabic_mmlu:professional_medicine|5|1
+community|arabic_mmlu:professional_psychology|5|1
+community|arabic_mmlu:public_relations|5|1
+community|arabic_mmlu:security_studies|5|1
+community|arabic_mmlu:sociology|5|1
+community|arabic_mmlu:us_foreign_policy|5|1
+community|arabic_mmlu:virology|5|1
+community|arabic_mmlu:world_religions|5|1
+community|arabic_exams|5|1
+community|acva:Algeria|5|1
+community|acva:Ancient_Egypt|5|1
+community|acva:Arab_Empire|5|1
+community|acva:Arabic_Architecture|5|1
+community|acva:Arabic_Art|5|1
+community|acva:Arabic_Astronomy|5|1
+community|acva:Arabic_Calligraphy|5|1
+community|acva:Arabic_Ceremony|5|1
+community|acva:Arabic_Clothing|5|1
+community|acva:Arabic_Culture|5|1
+community|acva:Arabic_Food|5|1
+community|acva:Arabic_Funeral|5|1
+community|acva:Arabic_Geography|5|1
+community|acva:Arabic_History|5|1
+community|acva:Arabic_Language_Origin|5|1
+community|acva:Arabic_Literature|5|1
+community|acva:Arabic_Math|5|1
+community|acva:Arabic_Medicine|5|1
+community|acva:Arabic_Music|5|1
+community|acva:Arabic_Ornament|5|1
+community|acva:Arabic_Philosophy|5|1
+community|acva:Arabic_Physics_and_Chemistry|5|1
+community|acva:Arabic_Wedding|5|1
+community|acva:Bahrain|5|1
+community|acva:Comoros|5|1
+community|acva:Egypt_modern|5|1
+community|acva:InfluenceFromAncientEgypt|5|1
+community|acva:InfluenceFromByzantium|5|1
+community|acva:InfluenceFromChina|5|1
+community|acva:InfluenceFromGreece|5|1
+community|acva:InfluenceFromIslam|5|1
+community|acva:InfluenceFromPersia|5|1
+community|acva:InfluenceFromRome|5|1
+community|acva:Iraq|5|1
+community|acva:Islam_Education|5|1
+community|acva:Islam_branches_and_schools|5|1
+community|acva:Islamic_law_system|5|1
+community|acva:Jordan|5|1
+community|acva:Kuwait|5|1
+community|acva:Lebanon|5|1
+community|acva:Libya|5|1
+community|acva:Mauritania|5|1
+community|acva:Mesopotamia_civilization|5|1
+community|acva:Morocco|5|1
+community|acva:Oman|5|1
+community|acva:Palestine|5|1
+community|acva:Qatar|5|1
+community|acva:Saudi_Arabia|5|1
+community|acva:Somalia|5|1
+community|acva:Sudan|5|1
+community|acva:Syria|5|1
+community|acva:Tunisia|5|1
+community|acva:United_Arab_Emirates|5|1
+community|acva:Yemen|5|1
+community|acva:communication|5|1
+community|acva:computer_and_phone|5|1
+community|acva:daily_life|5|1
+community|acva:entertainment|5|1
+community|Alghafa:mcq_exams_test_ar|5|1
+community|Alghafa:meta_ar_dialects|5|1
+community|Alghafa:meta_ar_msa|5|1
+community|Alghafa:multiple_choice_copa_translated_task|5|1
+community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
+community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
+community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
+community|Alghafa:multiple_choice_openbookqa_translated_task|5|1
+community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
+community|Alghafa:multiple_choice_rating_sentiment_task|5|1
+community|Alghafa:multiple_choice_sentiment_task|5|1
+community|race_ar|5|1
+community|piqa_ar|5|1
+community|arc_easy_ar|5|1
+community|arc_challenge_okapi_ar|5|1
+community|mmlu_okapi_ar|5|1
+community|openbook_qa_ext_ar|5|1
+community|boolq_ar|5|1
+community|copa_ext_ar|5|1
+community|hellaswag_okapi_ar|5|1
+community|toxigen_ar|5|1
+community|sciq_ar|5|1

From 129733baeb49fc350d01cccffd0c7c4574346684 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Thu, 7 Mar 2024 23:33:12 +0100
Subject: [PATCH 05/27] Update OALL_tasks.txt

Add the AlGhafa Translated tasks to the OALL task list and move
`lighteval|xstory_cloze:ar|0|0` to the top of the file.
---
 tasks_examples/OALL_tasks.txt | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt
index f6ef495ea..549039e18 100644
--- a/tasks_examples/OALL_tasks.txt
+++ b/tasks_examples/OALL_tasks.txt
@@ -1,3 +1,4 @@
+lighteval|xstory_cloze:ar|0|0
 community|arabic_mmlu:abstract_algebra|5|1
 community|arabic_mmlu:anatomy|5|1
 community|arabic_mmlu:astronomy|5|1
@@ -125,4 +126,13 @@ community|Alghafa:multiple_choice_openbookqa_translated_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_task|5|1
 community|Alghafa:multiple_choice_sentiment_task|5|1
-lighteval|xstory_cloze:ar|0|0
+community|race_ar|5|1
+community|piqa_ar|5|1
+community|arc_easy_ar|5|1
+community|arc_challenge_okapi_ar|5|1
+community|openbook_qa_ext_ar|5|1
+community|boolq_ar|5|1
+community|copa_ext_ar|5|1
+community|hellaswag_okapi_ar|5|1
+community|toxigen_ar|5|1
+community|sciq_ar|5|1

From fafdc1b52705712519abd66bcaae3f381338b2fb Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 8 Mar 2024 16:16:39 +0100
Subject: [PATCH 06/27] Update arabic_evals.py

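Add support for the AlGhafa Translated benchmark suite (11 subsets), with
dedicated prompt functions for boolq_ar, copa_ext_ar, hellaswag_okapi_ar,
toxigen_ar and sciq_ar.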
---
 community_tasks/arabic_evals.py | 280 +++++++++++++++++++++++++++++++-
 1 file changed, 277 insertions(+), 3 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 2fc1a2807..72667d26d 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -29,6 +29,7 @@
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
+import re
 
 
 # fmt: off
@@ -199,12 +200,12 @@ def arabic_exams(line, task_name: str = None):
     )
 
 
-## ALGHAFA ##
+## ALGHAFA NATIVE ##
 # fmt: off
 ALGHAFA_SUBSETS = [
     "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", 
     "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", 
-    "multiple_choice_sentiment_task", # "multiple_choice_openbookqa_translated_task", "multiple_choice_copa_translated_task" ### TODO : clean up this later !
+    "multiple_choice_sentiment_task"
 ]
 # fmt: on
 
@@ -260,7 +261,280 @@ def Alghafa(line, task_name: str = None):
     )
 
 
-_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task]
+## ALGHAFA TRANSLATED ##
+# race_ar ##
+race_ar_task = LightevalTaskConfig(
+    name="race_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="race_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# piqa_ar ##
+piqa_ar_task = LightevalTaskConfig(
+    name="piqa_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="piqa_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# arc_easy_ar ##
+arc_easy_ar_task = LightevalTaskConfig(
+    name="arc_easy_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="arc_easy_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# arc_challenge_okapi_ar ##
+arc_challenge_okapi_ar_task = LightevalTaskConfig(
+    name="arc_challenge_okapi_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="arc_challenge_okapi_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# mmlu_okapi_ar ##
+mmlu_okapi_ar_task = LightevalTaskConfig(
+    name="mmlu_okapi_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="mmlu_okapi_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# openbook_qa_ext_ar ##
+openbook_qa_ext_ar_task = LightevalTaskConfig(
+    name="openbook_qa_ext_ar",
+    prompt_function="Alghafa",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="openbook_qa_ext_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+
+# boolq_ar ##
+boolq_ar_task = LightevalTaskConfig(
+    name="boolq_ar",
+    prompt_function="boolq_function",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="boolq_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+def boolq_function(line, task_name: str = None):
+    question = line["question"]
+    passage = line["passage"]
+    answer = "نعم" if line["answer"] else "لا"
+
+    query = "بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ \"نعم\" أو \"لا\":\n{}\nالإجابة:".format(passage, question)
+    
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=["نعم", "لا"],
+        gold_index=0 if line["answer"] else 1,
+        instruction="",
+        target_for_fewshot_sorting=answer,
+    )
+
+
+# copa_ext_ar ##
+copa_ext_ar_task = LightevalTaskConfig(
+    name="copa_ext_ar",
+    prompt_function="copa_function",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="copa_ext_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+def copa_function(line, task_name: str = None):
+    premise = line["premise"]
+    choices = [line["choice1"], line["choice2"]]
+    question_map = {"cause": "لأن", "effect": "لذلك"}
+    question = question_map[line["question"]]
+    answer = line["label"]
+
+    query = "{}، {} :\n0) {}\n1) {}\nالإجابة:".format(premise, question, choices[0], choices[1])
+    
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        gold_index=answer,
+        instruction="",
+        target_for_fewshot_sorting=choices[answer],
+    )
+
+
+# hellaswag_okapi_ar ##
+hellaswag_okapi_ar_task = LightevalTaskConfig(
+    name="hellaswag_okapi_ar",
+    prompt_function="hellaswag_function",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="hellaswag_okapi_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+def hellaswag_function(line, task_name: str = None):
+    ctx = re.sub(r"\[.*?\]", "", line["ctx"])  # Remove latin words within brackets
+    endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])]  # endings is a string representation of a list
+    answer_index = line["label"]
+
+    query = "بناءً على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx)
+    for i, ending in enumerate(endings):
+        query += "{}) {}\n".format(i, ending)
+    query += "الإجابة:"
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=endings,
+        gold_index=answer_index,
+        instruction="",
+        target_for_fewshot_sorting=endings[answer_index],
+    )
+
+
+toxigen_ar_task = LightevalTaskConfig(
+    name="toxigen_ar",
+    prompt_function="toxigen_function",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="toxigen_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+def toxigen_function(line, task_name: str = None):
+    text = line["text"]
+    label = 1 if ((line['toxicity_ai'] + line['toxicity_human']) > 5.5) else 0
+    query = f"هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\".\n'{text}'\nالإجابة:"
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=["لا", "نعم"],
+        gold_index=label,
+        instruction="",
+        target_for_fewshot_sorting="نعم" if label == 1 else "لا",
+    )
+
+
+sciq_ar_task = LightevalTaskConfig(
+    name="sciq_ar",
+    prompt_function="sciq_function",
+    suite=["community"],
+    hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+    hf_subset="sciq_ar",
+    hf_avail_splits=["test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split="validation",
+    few_shots_select="sequential",
+    metric=["loglikelihood_acc_norm"],
+    # metric=["loglikelihood_acc"],
+    trust_dataset=True,
+)
+
+def sciq_function(line, task_name: str = None):
+    support = line["support"]
+    question = line["question"]
+    choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
+    answer_index = 3  # The label is always 3 for the correct answer
+
+    query = "بناءً على السياق التالي:\n{}\n اختر الإجابة الصحيحة من الاقتراحات التالية:\n".format(support)
+    for i, choice in enumerate(choices):
+        query += "{}) {}\n".format(i, choice)
+    query += "الإجابة:"
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        gold_index=answer_index,
+        instruction="",
+        target_for_fewshot_sorting=choices[answer_index],
+    )
+
+
+_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] + [arc_easy_ar_task] + [arc_challenge_okapi_ar_task] + [mmlu_okapi_ar_task] + [openbook_qa_ext_ar_task] + [boolq_ar_task] + [copa_ext_ar_task] + [hellaswag_okapi_ar_task] + [toxigen_ar_task] + [sciq_ar_task]
 
 # Convert to dict for lighteval
 TASKS_TABLE = [task.as_dict() for task in _TASKS]

From 9bb4da0ed59cffbc81293993c4c5f9f54eb8ff93 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 8 Mar 2024 16:57:15 +0100
Subject: [PATCH 07/27] Update arabic_evals.py

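Minor comment fixes flagged by the pre-commit hook.

Also rework the sciq_ar prompt so that it includes the question, which was
previously read from the line but never inserted into the query.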
---
 community_tasks/arabic_evals.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 72667d26d..5c9f636ea 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -200,7 +200,7 @@ def arabic_exams(line, task_name: str = None):
     )
 
 
-## ALGHAFA NATIVE ##
+# ALGHAFA NATIVE ##
 # fmt: off
 ALGHAFA_SUBSETS = [
     "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", 
@@ -261,8 +261,8 @@ def Alghafa(line, task_name: str = None):
     )
 
 
-## ALGHAFA TRANSLATED ##
-# race_ar ##
+# ALGHAFA TRANSLATED ##
+# race_ar 
 race_ar_task = LightevalTaskConfig(
     name="race_ar",
     prompt_function="Alghafa",
@@ -279,7 +279,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# piqa_ar ##
+# piqa_ar 
 piqa_ar_task = LightevalTaskConfig(
     name="piqa_ar",
     prompt_function="Alghafa",
@@ -296,7 +296,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# arc_easy_ar ##
+# arc_easy_ar 
 arc_easy_ar_task = LightevalTaskConfig(
     name="arc_easy_ar",
     prompt_function="Alghafa",
@@ -313,7 +313,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# arc_challenge_okapi_ar ##
+# arc_challenge_okapi_ar
 arc_challenge_okapi_ar_task = LightevalTaskConfig(
     name="arc_challenge_okapi_ar",
     prompt_function="Alghafa",
@@ -330,7 +330,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# mmlu_okapi_ar ##
+# mmlu_okapi_ar 
 mmlu_okapi_ar_task = LightevalTaskConfig(
     name="mmlu_okapi_ar",
     prompt_function="Alghafa",
@@ -347,7 +347,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# openbook_qa_ext_ar ##
+# openbook_qa_ext_ar 
 openbook_qa_ext_ar_task = LightevalTaskConfig(
     name="openbook_qa_ext_ar",
     prompt_function="Alghafa",
@@ -364,7 +364,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# boolq_ar ##
+# boolq_ar 
 boolq_ar_task = LightevalTaskConfig(
     name="boolq_ar",
     prompt_function="boolq_function",
@@ -397,7 +397,7 @@ def boolq_function(line, task_name: str = None):
     )
 
 
-# copa_ext_ar ##
+# copa_ext_ar 
 copa_ext_ar_task = LightevalTaskConfig(
     name="copa_ext_ar",
     prompt_function="copa_function",
@@ -432,7 +432,7 @@ def copa_function(line, task_name: str = None):
     )
 
 
-# hellaswag_okapi_ar ##
+# hellaswag_okapi_ar 
 hellaswag_okapi_ar_task = LightevalTaskConfig(
     name="hellaswag_okapi_ar",
     prompt_function="hellaswag_function",
@@ -468,6 +468,7 @@ def hellaswag_function(line, task_name: str = None):
     )
 
 
+# toxigen_ar 
 toxigen_ar_task = LightevalTaskConfig(
     name="toxigen_ar",
     prompt_function="toxigen_function",
@@ -498,6 +499,7 @@ def toxigen_function(line, task_name: str = None):
     )
 
 
+# sciq_ar
 sciq_ar_task = LightevalTaskConfig(
     name="sciq_ar",
     prompt_function="sciq_function",
@@ -519,10 +521,10 @@ def sciq_function(line, task_name: str = None):
     choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
     answer_index = 3  # The label is always 3 for the correct answer
 
-    query = "بناءً على السياق التالي:\n{}\n اختر الإجابة الصحيحة من الاقتراحات التالية:\n".format(support)
+    query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(support, question)
     for i, choice in enumerate(choices):
-        query += "{}) {}\n".format(i, choice)
-    query += "الإجابة:"
+        query += "\n{}) {}".format(i, choice)
+    query += "\nالإجابة:"
 
     return Doc(
         task_name=task_name,

From 3298ab5cc98529a0510fbd68b24d0302fbbf576f Mon Sep 17 00:00:00 2001
From: alielfilali01 <alielfilali0909@gmail.com>
Date: Fri, 8 Mar 2024 15:59:04 +0000
Subject: [PATCH 08/27] fix checks

---
 auto_commit_fixes.sh | 14 ++++++++++++++
 tests/__init__.py    |  1 -
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
new file mode 100755
index 000000000..dacc45864
--- /dev/null
+++ b/auto_commit_fixes.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Run pre-commit on all files
+pre-commit run --all-files
+
+# Check if there are changes that need to be staged and committed
+if ! git diff --quiet; then
+    echo "Fixing inconsistencies and committing..."
+    git add .
+    git commit -m "fix checks"
+    git push origin main
+else
+    echo "No changes detected."
+fi
diff --git a/tests/__init__.py b/tests/__init__.py
index 04980c23f..a732db8d0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -19,4 +19,3 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-

From ad1ee556c587d9c013b77469eda4708c726a7b7a Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 8 Mar 2024 17:13:47 +0100
Subject: [PATCH 09/27] Update OALL_tasks.txt

Remove the leftover translated entries
`community|Alghafa:multiple_choice_copa_translated_task|5|1` and
`community|Alghafa:multiple_choice_openbookqa_translated_task|5|1`
from the AlGhafa native section.
---
 tasks_examples/OALL_tasks.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt
index 549039e18..e86fbae1c 100644
--- a/tasks_examples/OALL_tasks.txt
+++ b/tasks_examples/OALL_tasks.txt
@@ -118,11 +118,9 @@ community|acva:entertainment|5|1
 community|Alghafa:mcq_exams_test_ar|5|1
 community|Alghafa:meta_ar_dialects|5|1
 community|Alghafa:meta_ar_msa|5|1
-community|Alghafa:multiple_choice_copa_translated_task|5|1
 community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
 community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
 community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
-community|Alghafa:multiple_choice_openbookqa_translated_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_task|5|1
 community|Alghafa:multiple_choice_sentiment_task|5|1

From a9a25377a01c56b10942e9f077062683ff0a18ed Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 8 Mar 2024 17:14:12 +0100
Subject: [PATCH 10/27] Update all_arabic_tasks.txt

Remove `community|Alghafa:multiple_choice_copa_translated_task|5|1`
and `community|Alghafa:multiple_choice_openbookqa_translated_task|5|1`,
which were mistakenly left in the ALGHAFA NATIVE section.
---
 tasks_examples/all_arabic_tasks.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt
index 578f934a5..2856a34a4 100644
--- a/tasks_examples/all_arabic_tasks.txt
+++ b/tasks_examples/all_arabic_tasks.txt
@@ -118,11 +118,9 @@ community|acva:entertainment|5|1
 community|Alghafa:mcq_exams_test_ar|5|1
 community|Alghafa:meta_ar_dialects|5|1
 community|Alghafa:meta_ar_msa|5|1
-community|Alghafa:multiple_choice_copa_translated_task|5|1
 community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
 community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
 community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
-community|Alghafa:multiple_choice_openbookqa_translated_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
 community|Alghafa:multiple_choice_rating_sentiment_task|5|1
 community|Alghafa:multiple_choice_sentiment_task|5|1

From 55e27bbbdca38696e5647d5d970fe3aa7c4389d8 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 8 Mar 2024 17:37:25 +0100
Subject: [PATCH 11/27] Delete auto_commit_fixes.sh

Remove the helper script; it is not needed.
---
 auto_commit_fixes.sh | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
deleted file mode 100755
index dacc45864..000000000
--- a/auto_commit_fixes.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-# Run pre-commit on all files
-pre-commit run --all-files
-
-# Check if there are changes that need to be staged and committed
-if ! git diff --quiet; then
-    echo "Fixing inconsistencies and committing..."
-    git add .
-    git commit -m "fix checks"
-    git push origin main
-else
-    echo "No changes detected."
-fi

From fea2cec365138782edab5eb30a06b3f387405fdc Mon Sep 17 00:00:00 2001
From: alielfilali01 <alielfilali0909@gmail.com>
Date: Fri, 8 Mar 2024 17:43:05 +0000
Subject: [PATCH 12/27] fix checks

---
 auto_commit_fixes.sh            | 17 +++++++++
 community_tasks/arabic_evals.py | 66 +++++++++++++++++++++++----------
 2 files changed, 63 insertions(+), 20 deletions(-)
 create mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
new file mode 100755
index 000000000..c4f93e9ba
--- /dev/null
+++ b/auto_commit_fixes.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Run pre-commit on all files
+pre-commit run --all-files
+
+# Run make style as suggested by Clémentine
+make style
+
+# Check if there are changes that need to be staged and committed
+if ! git diff --quiet; then
+    echo "Fixing inconsistencies and committing..."
+    git add .
+    git commit -m "fix checks"
+    git push origin main
+else
+    echo "No changes detected."
+fi
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 5c9f636ea..2e2538dfc 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -26,10 +26,11 @@
 
 This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+import re
+
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-import re
 
 
 # fmt: off
@@ -203,8 +204,8 @@ def arabic_exams(line, task_name: str = None):
 # ALGHAFA NATIVE ##
 # fmt: off
 ALGHAFA_SUBSETS = [
-    "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", 
-    "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", 
+    "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task",
+    "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task",
     "multiple_choice_sentiment_task"
 ]
 # fmt: on
@@ -262,7 +263,7 @@ def Alghafa(line, task_name: str = None):
 
 
 # ALGHAFA TRANSLATED ##
-# race_ar 
+# race_ar
 race_ar_task = LightevalTaskConfig(
     name="race_ar",
     prompt_function="Alghafa",
@@ -279,7 +280,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# piqa_ar 
+# piqa_ar
 piqa_ar_task = LightevalTaskConfig(
     name="piqa_ar",
     prompt_function="Alghafa",
@@ -296,7 +297,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# arc_easy_ar 
+# arc_easy_ar
 arc_easy_ar_task = LightevalTaskConfig(
     name="arc_easy_ar",
     prompt_function="Alghafa",
@@ -330,7 +331,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# mmlu_okapi_ar 
+# mmlu_okapi_ar
 mmlu_okapi_ar_task = LightevalTaskConfig(
     name="mmlu_okapi_ar",
     prompt_function="Alghafa",
@@ -347,7 +348,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# openbook_qa_ext_ar 
+# openbook_qa_ext_ar
 openbook_qa_ext_ar_task = LightevalTaskConfig(
     name="openbook_qa_ext_ar",
     prompt_function="Alghafa",
@@ -364,7 +365,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-# boolq_ar 
+# boolq_ar
 boolq_ar_task = LightevalTaskConfig(
     name="boolq_ar",
     prompt_function="boolq_function",
@@ -380,13 +381,14 @@ def Alghafa(line, task_name: str = None):
     trust_dataset=True,
 )
 
+
 def boolq_function(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
 
-    query = "بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ \"نعم\" أو \"لا\":\n{}\nالإجابة:".format(passage, question)
-    
+    query = 'بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ "نعم" أو "لا":\n{}\nالإجابة:'.format(passage, question)
+
     return Doc(
         task_name=task_name,
         query=query,
@@ -397,7 +399,7 @@ def boolq_function(line, task_name: str = None):
     )
 
 
-# copa_ext_ar 
+# copa_ext_ar
 copa_ext_ar_task = LightevalTaskConfig(
     name="copa_ext_ar",
     prompt_function="copa_function",
@@ -413,6 +415,7 @@ def boolq_function(line, task_name: str = None):
     trust_dataset=True,
 )
 
+
 def copa_function(line, task_name: str = None):
     premise = line["premise"]
     choices = [line["choice1"], line["choice2"]]
@@ -421,7 +424,7 @@ def copa_function(line, task_name: str = None):
     answer = line["label"]
 
     query = "{}، {} :\n0) {}\n1) {}\nالإجابة:".format(premise, question, choices[0], choices[1])
-    
+
     return Doc(
         task_name=task_name,
         query=query,
@@ -432,7 +435,7 @@ def copa_function(line, task_name: str = None):
     )
 
 
-# hellaswag_okapi_ar 
+# hellaswag_okapi_ar
 hellaswag_okapi_ar_task = LightevalTaskConfig(
     name="hellaswag_okapi_ar",
     prompt_function="hellaswag_function",
@@ -448,9 +451,12 @@ def copa_function(line, task_name: str = None):
     trust_dataset=True,
 )
 
+
 def hellaswag_function(line, task_name: str = None):
     ctx = re.sub(r"\[.*?\]", "", line["ctx"])  # Remove latin words within brackets
-    endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])]  # endings is a string representation of a list
+    endings = [
+        re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])
+    ]  # endings is a string representation of a list
     answer_index = line["label"]
 
     query = "بناءً على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx)
@@ -468,7 +474,7 @@ def hellaswag_function(line, task_name: str = None):
     )
 
 
-# toxigen_ar 
+# toxigen_ar
 toxigen_ar_task = LightevalTaskConfig(
     name="toxigen_ar",
     prompt_function="toxigen_function",
@@ -484,10 +490,11 @@ def hellaswag_function(line, task_name: str = None):
     trust_dataset=True,
 )
 
+
 def toxigen_function(line, task_name: str = None):
     text = line["text"]
-    label = 1 if ((line['toxicity_ai'] + line['toxicity_human']) > 5.5) else 0
-    query = f"هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\".\n'{text}'\nالإجابة:"
+    label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0
+    query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:'
 
     return Doc(
         task_name=task_name,
@@ -515,13 +522,16 @@ def toxigen_function(line, task_name: str = None):
     trust_dataset=True,
 )
 
+
 def sciq_function(line, task_name: str = None):
     support = line["support"]
     question = line["question"]
     choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
     answer_index = 3  # The label is always 3 for the correct answer
 
-    query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(support, question)
+    query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(
+        support, question
+    )
     for i, choice in enumerate(choices):
         query += "\n{}) {}".format(i, choice)
     query += "\nالإجابة:"
@@ -536,7 +546,23 @@ def sciq_function(line, task_name: str = None):
     )
 
 
-_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] + [arc_easy_ar_task] + [arc_challenge_okapi_ar_task] + [mmlu_okapi_ar_task] + [openbook_qa_ext_ar_task] + [boolq_ar_task] + [copa_ext_ar_task] + [hellaswag_okapi_ar_task] + [toxigen_ar_task] + [sciq_ar_task]
+_TASKS = (
+    ARABIC_MMLU_TASKS
+    + ACVA_TASKS
+    + ALGHAFA_TASKS
+    + [arabic_exams_task]
+    + [race_ar_task]
+    + [piqa_ar_task]
+    + [arc_easy_ar_task]
+    + [arc_challenge_okapi_ar_task]
+    + [mmlu_okapi_ar_task]
+    + [openbook_qa_ext_ar_task]
+    + [boolq_ar_task]
+    + [copa_ext_ar_task]
+    + [hellaswag_okapi_ar_task]
+    + [toxigen_ar_task]
+    + [sciq_ar_task]
+)
 
 # Convert to dict for lighteval
 TASKS_TABLE = [task.as_dict() for task in _TASKS]

From d6646f90f7ca7f8ba2fcabe32fa37e2c7414f066 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:00:20 +0000
Subject: [PATCH 13/27] Delete auto_commit_fixes.sh

---
 auto_commit_fixes.sh | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
deleted file mode 100755
index c4f93e9ba..000000000
--- a/auto_commit_fixes.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Run pre-commit on all files
-pre-commit run --all-files
-
-# Run make style as suggested by Clémentine
-make style
-
-# Check if there are changes that need to be staged and committed
-if ! git diff --quiet; then
-    echo "Fixing inconsistencies and committing..."
-    git add .
-    git commit -m "fix checks"
-    git push origin main
-else
-    echo "No changes detected."
-fi

From 306a5f3528fc2c6d587b1d05635e61b67ba8dcea Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:49:08 +0000
Subject: [PATCH 14/27] Update arabic_evals.py

Homogenize naming according to the following review comments:

####
Prompt names such as boolq_function will be unclear long term. For such functions, you could either use boolq_prompt_arabic or just boolq_arabic. (You need to specify the language since there is already a boolq prompt function by default.)

You also need to homogenize Alghafa, which exists with several different casings, and fit it to Python style casing. For the prompt function, I'd keep it as alghafa_prompt or alghafa; for the class, CustomAlGhafaTask; and here, for the name, I'd keep it lower case:
[CustomAlGhafaTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
####
---
 community_tasks/arabic_evals.py | 38 ++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 2e2538dfc..710477be9 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -220,7 +220,7 @@ def __init__(
         super().__init__(
             name=name,
             hf_subset=hf_subset,
-            prompt_function="Alghafa",
+            prompt_function="alghafa_prompt",
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
             metric=["loglikelihood_acc_norm"],
             # metric=["loglikelihood_acc"],
@@ -236,10 +236,10 @@ def __init__(
         )
 
 
-ALGHAFA_TASKS = [CustomALGHAFATask(name=f"Alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
+ALGHAFA_TASKS = [CustomALGHAFATask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
 
 
-def Alghafa(line, task_name: str = None):
+def alghafa_prompt(line, task_name: str = None):
     question = line["query"]
     answer_index = int(line["label"])
     # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label'
@@ -266,7 +266,7 @@ def Alghafa(line, task_name: str = None):
 # race_ar
 race_ar_task = LightevalTaskConfig(
     name="race_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="race_ar",
@@ -283,7 +283,7 @@ def Alghafa(line, task_name: str = None):
 # piqa_ar
 piqa_ar_task = LightevalTaskConfig(
     name="piqa_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="piqa_ar",
@@ -300,7 +300,7 @@ def Alghafa(line, task_name: str = None):
 # arc_easy_ar
 arc_easy_ar_task = LightevalTaskConfig(
     name="arc_easy_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="arc_easy_ar",
@@ -317,7 +317,7 @@ def Alghafa(line, task_name: str = None):
 # arc_challenge_okapi_ar
 arc_challenge_okapi_ar_task = LightevalTaskConfig(
     name="arc_challenge_okapi_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="arc_challenge_okapi_ar",
@@ -334,7 +334,7 @@ def Alghafa(line, task_name: str = None):
 # mmlu_okapi_ar
 mmlu_okapi_ar_task = LightevalTaskConfig(
     name="mmlu_okapi_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="mmlu_okapi_ar",
@@ -351,7 +351,7 @@ def Alghafa(line, task_name: str = None):
 # openbook_qa_ext_ar
 openbook_qa_ext_ar_task = LightevalTaskConfig(
     name="openbook_qa_ext_ar",
-    prompt_function="Alghafa",
+    prompt_function="alghafa_prompt",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="openbook_qa_ext_ar",
@@ -368,7 +368,7 @@ def Alghafa(line, task_name: str = None):
 # boolq_ar
 boolq_ar_task = LightevalTaskConfig(
     name="boolq_ar",
-    prompt_function="boolq_function",
+    prompt_function="boolq_prompt_arabic",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="boolq_ar",
@@ -382,7 +382,7 @@ def Alghafa(line, task_name: str = None):
 )
 
 
-def boolq_function(line, task_name: str = None):
+def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
@@ -402,7 +402,7 @@ def boolq_function(line, task_name: str = None):
 # copa_ext_ar
 copa_ext_ar_task = LightevalTaskConfig(
     name="copa_ext_ar",
-    prompt_function="copa_function",
+    prompt_function="copa_prompt_arabic",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="copa_ext_ar",
@@ -416,7 +416,7 @@ def boolq_function(line, task_name: str = None):
 )
 
 
-def copa_function(line, task_name: str = None):
+def copa_prompt_arabic(line, task_name: str = None):
     premise = line["premise"]
     choices = [line["choice1"], line["choice2"]]
     question_map = {"cause": "لأن", "effect": "لذلك"}
@@ -438,7 +438,7 @@ def copa_function(line, task_name: str = None):
 # hellaswag_okapi_ar
 hellaswag_okapi_ar_task = LightevalTaskConfig(
     name="hellaswag_okapi_ar",
-    prompt_function="hellaswag_function",
+    prompt_function="hellaswag_prompt_arabic",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="hellaswag_okapi_ar",
@@ -452,7 +452,7 @@ def copa_function(line, task_name: str = None):
 )
 
 
-def hellaswag_function(line, task_name: str = None):
+def hellaswag_prompt_arabic(line, task_name: str = None):
     ctx = re.sub(r"\[.*?\]", "", line["ctx"])  # Remove latin words within brackets
     endings = [
         re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])
@@ -477,7 +477,7 @@ def hellaswag_function(line, task_name: str = None):
 # toxigen_ar
 toxigen_ar_task = LightevalTaskConfig(
     name="toxigen_ar",
-    prompt_function="toxigen_function",
+    prompt_function="toxigen_prompt_arabic",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="toxigen_ar",
@@ -491,7 +491,7 @@ def hellaswag_function(line, task_name: str = None):
 )
 
 
-def toxigen_function(line, task_name: str = None):
+def toxigen_prompt_arabic(line, task_name: str = None):
     text = line["text"]
     label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0
     query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:'
@@ -509,7 +509,7 @@ def toxigen_function(line, task_name: str = None):
 # sciq_ar
 sciq_ar_task = LightevalTaskConfig(
     name="sciq_ar",
-    prompt_function="sciq_function",
+    prompt_function="sciq_prompt_arabic",
     suite=["community"],
     hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
     hf_subset="sciq_ar",
@@ -523,7 +523,7 @@ def toxigen_function(line, task_name: str = None):
 )
 
 
-def sciq_function(line, task_name: str = None):
+def sciq_prompt_arabic(line, task_name: str = None):
     support = line["support"]
     question = line["question"]
     choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]

From 2fbad52dcca7104164e9c16e05572245e11da070 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:49:54 +0000
Subject: [PATCH 15/27] Update OALL_tasks.txt

Homogenize AlGhafa naming: `Alghafa` to `alghafa`
---
 tasks_examples/OALL_tasks.txt | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt
index e86fbae1c..346d062c6 100644
--- a/tasks_examples/OALL_tasks.txt
+++ b/tasks_examples/OALL_tasks.txt
@@ -115,15 +115,15 @@ community|acva:communication|5|1
 community|acva:computer_and_phone|5|1
 community|acva:daily_life|5|1
 community|acva:entertainment|5|1
-community|Alghafa:mcq_exams_test_ar|5|1
-community|Alghafa:meta_ar_dialects|5|1
-community|Alghafa:meta_ar_msa|5|1
-community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
-community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
-community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
-community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
-community|Alghafa:multiple_choice_rating_sentiment_task|5|1
-community|Alghafa:multiple_choice_sentiment_task|5|1
+community|alghafa:mcq_exams_test_ar|5|1
+community|alghafa:meta_ar_dialects|5|1
+community|alghafa:meta_ar_msa|5|1
+community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
+community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
+community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
+community|alghafa:multiple_choice_rating_sentiment_task|5|1
+community|alghafa:multiple_choice_sentiment_task|5|1
 community|race_ar|5|1
 community|piqa_ar|5|1
 community|arc_easy_ar|5|1

From f3724035399e8e6bbe53f12f72c8ebe93cc95079 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:50:08 +0000
Subject: [PATCH 16/27] Update all_arabic_tasks.txt

Homogenize AlGhafa naming: `Alghafa` to `alghafa`
---
 tasks_examples/all_arabic_tasks.txt | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt
index 2856a34a4..fa430ed14 100644
--- a/tasks_examples/all_arabic_tasks.txt
+++ b/tasks_examples/all_arabic_tasks.txt
@@ -115,15 +115,15 @@ community|acva:communication|5|1
 community|acva:computer_and_phone|5|1
 community|acva:daily_life|5|1
 community|acva:entertainment|5|1
-community|Alghafa:mcq_exams_test_ar|5|1
-community|Alghafa:meta_ar_dialects|5|1
-community|Alghafa:meta_ar_msa|5|1
-community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
-community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1
-community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
-community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
-community|Alghafa:multiple_choice_rating_sentiment_task|5|1
-community|Alghafa:multiple_choice_sentiment_task|5|1
+community|alghafa:mcq_exams_test_ar|5|1
+community|alghafa:meta_ar_dialects|5|1
+community|alghafa:meta_ar_msa|5|1
+community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1
+community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1
+community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1
+community|alghafa:multiple_choice_rating_sentiment_task|5|1
+community|alghafa:multiple_choice_sentiment_task|5|1
 community|race_ar|5|1
 community|piqa_ar|5|1
 community|arc_easy_ar|5|1

From 2c492f6d6ebc42d9013f7620650783d99cc49cc4 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:57:37 +0000
Subject: [PATCH 17/27] Update community_tasks/arabic_evals.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 community_tasks/arabic_evals.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 710477be9..1c2a25039 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -223,7 +223,6 @@ def __init__(
             prompt_function="alghafa_prompt",
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
             metric=["loglikelihood_acc_norm"],
-            # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",

From e7ddfb547319f02aae80232d504fc5eeb2a99b4b Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:57:44 +0000
Subject: [PATCH 18/27] Update community_tasks/arabic_evals.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 community_tasks/arabic_evals.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 1c2a25039..664a622f7 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -173,7 +173,6 @@ def acva(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 

From f7278c19729245eed17f3f00acdbebc377e93448 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:57:55 +0000
Subject: [PATCH 19/27] Update community_tasks/arabic_evals.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 community_tasks/arabic_evals.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 664a622f7..f330d47b5 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -64,7 +64,6 @@ def __init__(
             prompt_function="mmlu_arabic",
             hf_repo="OALL/Arabic_MMLU",
             metric=["loglikelihood_acc_norm"],
-            # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",

From f186ded6b94c980cea08e28aa27666b31b7b0404 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 12 Mar 2024 10:58:03 +0000
Subject: [PATCH 20/27] Update community_tasks/arabic_evals.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 community_tasks/arabic_evals.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index f330d47b5..d8af5db6c 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -131,7 +131,6 @@ def __init__(
             prompt_function="acva",
             hf_repo="OALL/ACVA",
             metric=["loglikelihood_acc_norm"],
-            # metric=["loglikelihood_acc"],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",

From 6be58045286ae3c447c74b38442a271ebbb51acc Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Thu, 14 Mar 2024 13:22:30 +0000
Subject: [PATCH 21/27] Update community_tasks/arabic_evals.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

use the standard camel casing for classes:

(remove) class CustomALGHAFATask(LightevalTaskConfig):

(add) class CustomAlGhafaTask(LightevalTaskConfig):

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 community_tasks/arabic_evals.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index d8af5db6c..3bc61fbf7 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -208,7 +208,7 @@ def arabic_exams(line, task_name: str = None):
 # fmt: on
 
 
-class CustomALGHAFATask(LightevalTaskConfig):
+class CustomAlGhafaTask(LightevalTaskConfig):
     def __init__(
         self,
         name,

From e65e026785dd410c47850f507cbbe179cfaf1d4b Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Thu, 14 Mar 2024 14:05:46 +0000
Subject: [PATCH 22/27] Update arabic_evals.py

Fixes based on Clementine's comments
---
 community_tasks/arabic_evals.py | 95 +++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 34 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 3bc61fbf7..e456e2b45 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -27,6 +27,7 @@
 This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
 import re
+import random
 
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
@@ -208,7 +209,7 @@ def arabic_exams(line, task_name: str = None):
 # fmt: on
 
 
-class CustomAlGhafaTask(LightevalTaskConfig):
+class CustomAlGhafaNativeTask(LightevalTaskConfig):
     def __init__(
         self,
         name,
@@ -232,7 +233,7 @@ def __init__(
         )
 
 
-ALGHAFA_TASKS = [CustomALGHAFATask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
+ALGHAFA_TASKS = [CustomAlGhafaNativeTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS]
 
 
 def alghafa_prompt(line, task_name: str = None):
@@ -271,7 +272,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -288,7 +288,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -305,7 +304,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -322,7 +320,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -339,7 +336,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -356,7 +352,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -373,7 +368,6 @@ def alghafa_prompt(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -382,15 +376,22 @@ def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
-
-    query = 'بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ "نعم" أو "لا":\n{}\nالإجابة:'.format(passage, question)
+    instruction = "بناء على المقطع التالي، أجب عن السؤال ب \"نعم\" أو \"لا\""
+    query = f"""
+    {instruction}
+    المقطع :
+    {passage}
+    السؤال:
+    {question}
+    الإجابة:
+    """
 
     return Doc(
         task_name=task_name,
         query=query,
         choices=["نعم", "لا"],
         gold_index=0 if line["answer"] else 1,
-        instruction="",
+        instruction=instruction,
         target_for_fewshot_sorting=answer,
     )
 
@@ -407,7 +408,6 @@ def boolq_prompt_arabic(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -443,21 +443,25 @@ def copa_prompt_arabic(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
 
 def hellaswag_prompt_arabic(line, task_name: str = None):
     ctx = re.sub(r"\[.*?\]", "", line["ctx"])  # Remove latin words within brackets
-    endings = [
-        re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])
-    ]  # endings is a string representation of a list
+    endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])]  # endings is a string representation of a list
     answer_index = line["label"]
-
-    query = "بناءً على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx)
+    instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية"
+
+    query = f"""
+    {instruction}
+    السياق:
+    {ctx}
+    الاقتراحات:
+    
+    """
     for i, ending in enumerate(endings):
-        query += "{}) {}\n".format(i, ending)
+        query += f"{i}) {ending}\n"
     query += "الإجابة:"
 
     return Doc(
@@ -465,7 +469,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
         query=query,
         choices=endings,
         gold_index=answer_index,
-        instruction="",
+        instruction=instruction,
         target_for_fewshot_sorting=endings[answer_index],
     )
 
@@ -482,7 +486,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -490,14 +493,21 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
 def toxigen_prompt_arabic(line, task_name: str = None):
     text = line["text"]
     label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0
-    query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:'
+    instruction = "هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\"."
+
+    query = f"""
+    {instruction}
+    العبارة:
+    '{text}'
+    الإجابة:
+    """
 
     return Doc(
         task_name=task_name,
         query=query,
         choices=["لا", "نعم"],
         gold_index=label,
-        instruction="",
+        instruction=instruction,
         target_for_fewshot_sorting="نعم" if label == 1 else "لا",
     )
 
@@ -514,7 +524,6 @@ def toxigen_prompt_arabic(line, task_name: str = None):
     few_shots_split="validation",
     few_shots_select="sequential",
     metric=["loglikelihood_acc_norm"],
-    # metric=["loglikelihood_acc"],
     trust_dataset=True,
 )
 
@@ -522,22 +531,40 @@ def toxigen_prompt_arabic(line, task_name: str = None):
 def sciq_prompt_arabic(line, task_name: str = None):
     support = line["support"]
     question = line["question"]
-    choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
-    answer_index = 3  # The label is always 3 for the correct answer
-
-    query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(
-        support, question
-    )
+    correct_answer = line["correct_answer"]
+    choices = [
+        line["distractor1"],
+        line["distractor2"],
+        line["distractor3"],
+        correct_answer
+    ]
+    
+    # Shuffle the choices
+    random.shuffle(choices)
+    
+    answer_index = choices.index(correct_answer)
+    
+    instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
+
+    query = f"""
+    {instruction}
+    السياق:
+    {support}
+    السؤال:
+    {question}
+    الإجابات المحتملة:
+    
+    """
     for i, choice in enumerate(choices):
-        query += "\n{}) {}".format(i, choice)
-    query += "\nالإجابة:"
+        query += f"{i}) {choice}\n"
+    query += "الإجابة:"
 
     return Doc(
         task_name=task_name,
         query=query,
         choices=choices,
         gold_index=answer_index,
-        instruction="",
+        instruction=instruction,
         target_for_fewshot_sorting=choices[answer_index],
     )
 

From c0e1a3f38d3a62614aaf1b2949eab12f188dde5f Mon Sep 17 00:00:00 2001
From: alielfilali01 <alielfilali0909@gmail.com>
Date: Thu, 14 Mar 2024 14:14:49 +0000
Subject: [PATCH 23/27] fix checks

---
 auto_commit_fixes.sh            | 17 +++++++++++++++++
 community_tasks/arabic_evals.py | 27 ++++++++++++---------------
 2 files changed, 29 insertions(+), 15 deletions(-)
 create mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
new file mode 100755
index 000000000..c4f93e9ba
--- /dev/null
+++ b/auto_commit_fixes.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Run pre-commit on all files
+pre-commit run --all-files
+
+# Run make style as suggested by Clémentine
+make style
+
+# Check if there are changes that need to be staged and committed
+if ! git diff --quiet; then
+    echo "Fixing inconsistencies and committing..."
+    git add .
+    git commit -m "fix checks"
+    git push origin main
+else
+    echo "No changes detected."
+fi
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index e456e2b45..d420476de 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -26,8 +26,8 @@
 
 This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
-import re
 import random
+import re
 
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
@@ -376,7 +376,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
-    instruction = "بناء على المقطع التالي، أجب عن السؤال ب \"نعم\" أو \"لا\""
+    instruction = 'بناء على المقطع التالي، أجب عن السؤال ب "نعم" أو "لا"'
     query = f"""
     {instruction}
     المقطع :
@@ -449,7 +449,9 @@ def copa_prompt_arabic(line, task_name: str = None):
 
 def hellaswag_prompt_arabic(line, task_name: str = None):
     ctx = re.sub(r"\[.*?\]", "", line["ctx"])  # Remove latin words within brackets
-    endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])]  # endings is a string representation of a list
+    endings = [
+        re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])
+    ]  # endings is a string representation of a list
     answer_index = line["label"]
     instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية"
 
@@ -458,7 +460,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
     السياق:
     {ctx}
     الاقتراحات:
-    
+
     """
     for i, ending in enumerate(endings):
         query += f"{i}) {ending}\n"
@@ -493,7 +495,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
 def toxigen_prompt_arabic(line, task_name: str = None):
     text = line["text"]
     label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0
-    instruction = "هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\"."
+    instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".'
 
     query = f"""
     {instruction}
@@ -532,18 +534,13 @@ def sciq_prompt_arabic(line, task_name: str = None):
     support = line["support"]
     question = line["question"]
     correct_answer = line["correct_answer"]
-    choices = [
-        line["distractor1"],
-        line["distractor2"],
-        line["distractor3"],
-        correct_answer
-    ]
-    
+    choices = [line["distractor1"], line["distractor2"], line["distractor3"], correct_answer]
+
     # Shuffle the choices
     random.shuffle(choices)
-    
+
     answer_index = choices.index(correct_answer)
-    
+
     instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
 
     query = f"""
@@ -553,7 +550,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
     السؤال:
     {question}
     الإجابات المحتملة:
-    
+
     """
     for i, choice in enumerate(choices):
         query += f"{i}) {choice}\n"

From 294ac498b250242b5dae9e0b8b5e2b351e6efe1b Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Thu, 14 Mar 2024 14:18:37 +0000
Subject: [PATCH 24/27] Delete auto_commit_fixes.sh

---
 auto_commit_fixes.sh | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
deleted file mode 100755
index c4f93e9ba..000000000
--- a/auto_commit_fixes.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Run pre-commit on all files
-pre-commit run --all-files
-
-# Run make style as suggested by Clémentine
-make style
-
-# Check if there are changes that need to be staged and committed
-if ! git diff --quiet; then
-    echo "Fixing inconsistencies and committing..."
-    git add .
-    git commit -m "fix checks"
-    git push origin main
-else
-    echo "No changes detected."
-fi

From 97bd39338335dd40efb633e405628e271643280d Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 26 Mar 2024 16:15:42 +0000
Subject: [PATCH 25/27] Update arabic_evals.py

Fix ValueError: Prompt query
---
 community_tasks/arabic_evals.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index d420476de..4fe59c2d6 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -376,9 +376,8 @@ def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
-    instruction = 'بناء على المقطع التالي، أجب عن السؤال ب "نعم" أو "لا"'
-    query = f"""
-    {instruction}
+    instruction = 'بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا'
+    query = f"""{instruction}
     المقطع :
     {passage}
     السؤال:
@@ -455,8 +454,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
     answer_index = line["label"]
     instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية"
 
-    query = f"""
-    {instruction}
+    query = f"""{instruction}
     السياق:
     {ctx}
     الاقتراحات:
@@ -497,8 +495,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
     label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0
     instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".'
 
-    query = f"""
-    {instruction}
+    query = f"""{instruction}
     العبارة:
     '{text}'
     الإجابة:
@@ -543,8 +540,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
 
     instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
 
-    query = f"""
-    {instruction}
+    query = f"""{instruction}
     السياق:
     {support}
     السؤال:

From e8cf58dce3a907605937d9513f88fe33b310cd86 Mon Sep 17 00:00:00 2001
From: alielfilali01 <alielfilali0909@gmail.com>
Date: Tue, 26 Mar 2024 17:01:19 +0000
Subject: [PATCH 26/27] fix checks

---
 auto_commit_fixes.sh            | 17 +++++++++++++++++
 community_tasks/arabic_evals.py |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
new file mode 100755
index 000000000..c4f93e9ba
--- /dev/null
+++ b/auto_commit_fixes.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Run pre-commit on all files
+pre-commit run --all-files
+
+# Run make style as suggested by Clémentine
+make style
+
+# Check if there are changes that need to be staged and committed
+if ! git diff --quiet; then
+    echo "Fixing inconsistencies and committing..."
+    git add .
+    git commit -m "fix checks"
+    git push origin main
+else
+    echo "No changes detected."
+fi
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 4fe59c2d6..60db04505 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -376,7 +376,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
     answer = "نعم" if line["answer"] else "لا"
-    instruction = 'بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا'
+    instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
     query = f"""{instruction}
     المقطع :
     {passage}

From be7f3b834c788273ac1ee23fd47349f3e8dfbe30 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Tue, 26 Mar 2024 17:02:53 +0000
Subject: [PATCH 27/27] Delete auto_commit_fixes.sh

---
 auto_commit_fixes.sh | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100755 auto_commit_fixes.sh

diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh
deleted file mode 100755
index c4f93e9ba..000000000
--- a/auto_commit_fixes.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Run pre-commit on all files
-pre-commit run --all-files
-
-# Run make style as suggested by Clémentine
-make style
-
-# Check if there are changes that need to be staged and committed
-if ! git diff --quiet; then
-    echo "Fixing inconsistencies and committing..."
-    git add .
-    git commit -m "fix checks"
-    git push origin main
-else
-    echo "No changes detected."
-fi