From 859c5f0ddf11dc6c9aa3065061d06823df35fffc Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 5 Mar 2024 17:32:34 +0100 Subject: [PATCH 01/27] Update arabic_evals.py Add Support for the AlGhafa benchmarking suite --- community_tasks/arabic_evals.py | 72 +++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 07ef6b327..4e5aefe7b 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -61,7 +61,8 @@ def __init__( hf_subset=hf_subset, prompt_function="mmlu_arabic", hf_repo="OALL/Arabic_MMLU", - metric=["loglikelihood_acc"], + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split="dev", @@ -128,7 +129,8 @@ def __init__( hf_subset=hf_subset, prompt_function="acva", hf_repo="OALL/ACVA", - metric=["loglikelihood_acc"], + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -168,7 +170,8 @@ def acva(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", - metric=["loglikelihood_acc"], + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -196,7 +199,68 @@ def arabic_exams(line, task_name: str = None): ) -_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + [arabic_exams_task] +## ALGHAFA ## +# fmt: off +ALGHAFA_SUBSETS = [ + "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_copa_translated_task", "multiple_choice_facts_truefalse_balanced_task", + "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_openbookqa_translated_task", "multiple_choice_rating_sentiment_no_neutral_task", 
"multiple_choice_rating_sentiment_task", + "multiple_choice_sentiment_task" +] +# fmt: on + + +class CustomALGHAFATask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function="Alghafa", + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + ) + + +ALGHAFA_TASKS = [CustomALGHAFATask(name=f"Alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] + + +def Alghafa(line, task_name: str = None): + question = line["query"] + answer_index = int(line["label"]) + # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' + choices_keys = [key for key in line.keys() if key not in ["query", "label", "__few_shots"]] + choices = [line[key] for key in choices_keys] + + instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" + query = f"{instruction}السؤال: {question}\n" + for index, choice in enumerate(choices): + query += f"{index}) {choice}\n" + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=choices[answer_index], + ) + + +_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] From 6249e1f5b9df9df14cf34b90a4374cfaafaeb904 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 5 Mar 2024 17:37:12 +0100 Subject: [PATCH 02/27] Update OALL_tasks.txt Adding support to the AlGhafa benchmarking suite --- tasks_examples/OALL_tasks.txt | 11 +++++++++++ 1 file 
changed, 11 insertions(+) diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt index 5428fba49..f6ef495ea 100644 --- a/tasks_examples/OALL_tasks.txt +++ b/tasks_examples/OALL_tasks.txt @@ -114,4 +114,15 @@ community|acva:communication|5|1 community|acva:computer_and_phone|5|1 community|acva:daily_life|5|1 community|acva:entertainment|5|1 +community|Alghafa:mcq_exams_test_ar|5|1 +community|Alghafa:meta_ar_dialects|5|1 +community|Alghafa:meta_ar_msa|5|1 +community|Alghafa:multiple_choice_copa_translated_task|5|1 +community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 +community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 +community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 +community|Alghafa:multiple_choice_openbookqa_translated_task|5|1 +community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 +community|Alghafa:multiple_choice_rating_sentiment_task|5|1 +community|Alghafa:multiple_choice_sentiment_task|5|1 lighteval|xstory_cloze:ar|0|0 From d30d1ed19ebb566e09cfad28f6ff2b7d81714453 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 6 Mar 2024 22:10:53 +0100 Subject: [PATCH 03/27] Update arabic_evals.py remove translated from AlGhafa --- community_tasks/arabic_evals.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 4e5aefe7b..2fc1a2807 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -202,9 +202,9 @@ def arabic_exams(line, task_name: str = None): ## ALGHAFA ## # fmt: off ALGHAFA_SUBSETS = [ - "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_copa_translated_task", "multiple_choice_facts_truefalse_balanced_task", - "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_openbookqa_translated_task", 
"multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", - "multiple_choice_sentiment_task" + "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", + "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", + "multiple_choice_sentiment_task", # "multiple_choice_openbookqa_translated_task", "multiple_choice_copa_translated_task" ### TODO : clean up this later ! ] # fmt: on @@ -219,7 +219,7 @@ def __init__( name=name, hf_subset=hf_subset, prompt_function="Alghafa", - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark", + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=["loglikelihood_acc_norm"], # metric=["loglikelihood_acc"], hf_avail_splits=["test", "validation"], From 7f1e657a0c47ac0201de52b71200b00856a3c73e Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 7 Mar 2024 23:31:46 +0100 Subject: [PATCH 04/27] Create all_arabic_tasks.txt This file now contains all the arabic tasks including tasks not present in OALL_tasks.txt --- tasks_examples/all_arabic_tasks.txt | 139 ++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 tasks_examples/all_arabic_tasks.txt diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt new file mode 100644 index 000000000..578f934a5 --- /dev/null +++ b/tasks_examples/all_arabic_tasks.txt @@ -0,0 +1,139 @@ +lighteval|xstory_cloze:ar|0|0 +community|arabic_mmlu:abstract_algebra|5|1 +community|arabic_mmlu:anatomy|5|1 +community|arabic_mmlu:astronomy|5|1 +community|arabic_mmlu:business_ethics|5|1 +community|arabic_mmlu:clinical_knowledge|5|1 +community|arabic_mmlu:college_biology|5|1 +community|arabic_mmlu:college_chemistry|5|1 +community|arabic_mmlu:college_computer_science|5|1 
+community|arabic_mmlu:college_mathematics|5|1 +community|arabic_mmlu:college_medicine|5|1 +community|arabic_mmlu:college_physics|5|1 +community|arabic_mmlu:computer_security|5|1 +community|arabic_mmlu:conceptual_physics|5|1 +community|arabic_mmlu:econometrics|5|1 +community|arabic_mmlu:electrical_engineering|5|1 +community|arabic_mmlu:elementary_mathematics|5|1 +community|arabic_mmlu:formal_logic|5|1 +community|arabic_mmlu:global_facts|5|1 +community|arabic_mmlu:high_school_biology|5|1 +community|arabic_mmlu:high_school_chemistry|5|1 +community|arabic_mmlu:high_school_computer_science|5|1 +community|arabic_mmlu:high_school_european_history|5|1 +community|arabic_mmlu:high_school_geography|5|1 +community|arabic_mmlu:high_school_government_and_politics|5|1 +community|arabic_mmlu:high_school_macroeconomics|5|1 +community|arabic_mmlu:high_school_mathematics|5|1 +community|arabic_mmlu:high_school_microeconomics|5|1 +community|arabic_mmlu:high_school_physics|5|1 +community|arabic_mmlu:high_school_psychology|5|1 +community|arabic_mmlu:high_school_statistics|5|1 +community|arabic_mmlu:high_school_us_history|5|1 +community|arabic_mmlu:high_school_world_history|5|1 +community|arabic_mmlu:human_aging|5|1 +community|arabic_mmlu:human_sexuality|5|1 +community|arabic_mmlu:international_law|5|1 +community|arabic_mmlu:jurisprudence|5|1 +community|arabic_mmlu:logical_fallacies|5|1 +community|arabic_mmlu:machine_learning|5|1 +community|arabic_mmlu:management|5|1 +community|arabic_mmlu:marketing|5|1 +community|arabic_mmlu:medical_genetics|5|1 +community|arabic_mmlu:miscellaneous|5|1 +community|arabic_mmlu:moral_disputes|5|1 +community|arabic_mmlu:moral_scenarios|5|1 +community|arabic_mmlu:nutrition|5|1 +community|arabic_mmlu:philosophy|5|1 +community|arabic_mmlu:prehistory|5|1 +community|arabic_mmlu:professional_accounting|5|1 +community|arabic_mmlu:professional_law|5|1 +community|arabic_mmlu:professional_medicine|5|1 +community|arabic_mmlu:professional_psychology|5|1 
+community|arabic_mmlu:public_relations|5|1 +community|arabic_mmlu:security_studies|5|1 +community|arabic_mmlu:sociology|5|1 +community|arabic_mmlu:us_foreign_policy|5|1 +community|arabic_mmlu:virology|5|1 +community|arabic_mmlu:world_religions|5|1 +community|arabic_exams|5|1 +community|acva:Algeria|5|1 +community|acva:Ancient_Egypt|5|1 +community|acva:Arab_Empire|5|1 +community|acva:Arabic_Architecture|5|1 +community|acva:Arabic_Art|5|1 +community|acva:Arabic_Astronomy|5|1 +community|acva:Arabic_Calligraphy|5|1 +community|acva:Arabic_Ceremony|5|1 +community|acva:Arabic_Clothing|5|1 +community|acva:Arabic_Culture|5|1 +community|acva:Arabic_Food|5|1 +community|acva:Arabic_Funeral|5|1 +community|acva:Arabic_Geography|5|1 +community|acva:Arabic_History|5|1 +community|acva:Arabic_Language_Origin|5|1 +community|acva:Arabic_Literature|5|1 +community|acva:Arabic_Math|5|1 +community|acva:Arabic_Medicine|5|1 +community|acva:Arabic_Music|5|1 +community|acva:Arabic_Ornament|5|1 +community|acva:Arabic_Philosophy|5|1 +community|acva:Arabic_Physics_and_Chemistry|5|1 +community|acva:Arabic_Wedding|5|1 +community|acva:Bahrain|5|1 +community|acva:Comoros|5|1 +community|acva:Egypt_modern|5|1 +community|acva:InfluenceFromAncientEgypt|5|1 +community|acva:InfluenceFromByzantium|5|1 +community|acva:InfluenceFromChina|5|1 +community|acva:InfluenceFromGreece|5|1 +community|acva:InfluenceFromIslam|5|1 +community|acva:InfluenceFromPersia|5|1 +community|acva:InfluenceFromRome|5|1 +community|acva:Iraq|5|1 +community|acva:Islam_Education|5|1 +community|acva:Islam_branches_and_schools|5|1 +community|acva:Islamic_law_system|5|1 +community|acva:Jordan|5|1 +community|acva:Kuwait|5|1 +community|acva:Lebanon|5|1 +community|acva:Libya|5|1 +community|acva:Mauritania|5|1 +community|acva:Mesopotamia_civilization|5|1 +community|acva:Morocco|5|1 +community|acva:Oman|5|1 +community|acva:Palestine|5|1 +community|acva:Qatar|5|1 +community|acva:Saudi_Arabia|5|1 +community|acva:Somalia|5|1 
+community|acva:Sudan|5|1 +community|acva:Syria|5|1 +community|acva:Tunisia|5|1 +community|acva:United_Arab_Emirates|5|1 +community|acva:Yemen|5|1 +community|acva:communication|5|1 +community|acva:computer_and_phone|5|1 +community|acva:daily_life|5|1 +community|acva:entertainment|5|1 +community|Alghafa:mcq_exams_test_ar|5|1 +community|Alghafa:meta_ar_dialects|5|1 +community|Alghafa:meta_ar_msa|5|1 +community|Alghafa:multiple_choice_copa_translated_task|5|1 +community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 +community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 +community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 +community|Alghafa:multiple_choice_openbookqa_translated_task|5|1 +community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 +community|Alghafa:multiple_choice_rating_sentiment_task|5|1 +community|Alghafa:multiple_choice_sentiment_task|5|1 +community|race_ar|5|1 +community|piqa_ar|5|1 +community|arc_easy_ar|5|1 +community|arc_challenge_okapi_ar|5|1 +community|mmlu_okapi_ar|5|1 +community|openbook_qa_ext_ar|5|1 +community|boolq_ar|5|1 +community|copa_ext_ar|5|1 +community|hellaswag_okapi_ar|5|1 +community|toxigen_ar|5|1 +community|sciq_ar|5|1 From 129733baeb49fc350d01cccffd0c7c4574346684 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 7 Mar 2024 23:33:12 +0100 Subject: [PATCH 05/27] Update OALL_tasks.txt Add support for ALGHAFA TRANSLATED tasks --- tasks_examples/OALL_tasks.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt index f6ef495ea..549039e18 100644 --- a/tasks_examples/OALL_tasks.txt +++ b/tasks_examples/OALL_tasks.txt @@ -1,3 +1,4 @@ +lighteval|xstory_cloze:ar|0|0 community|arabic_mmlu:abstract_algebra|5|1 community|arabic_mmlu:anatomy|5|1 community|arabic_mmlu:astronomy|5|1 @@ -125,4 +126,13 @@ 
community|Alghafa:multiple_choice_openbookqa_translated_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_task|5|1 community|Alghafa:multiple_choice_sentiment_task|5|1 -lighteval|xstory_cloze:ar|0|0 +community|race_ar|5|1 +community|piqa_ar|5|1 +community|arc_easy_ar|5|1 +community|arc_challenge_okapi_ar|5|1 +community|openbook_qa_ext_ar|5|1 +community|boolq_ar|5|1 +community|copa_ext_ar|5|1 +community|hellaswag_okapi_ar|5|1 +community|toxigen_ar|5|1 +community|sciq_ar|5|1 From fafdc1b52705712519abd66bcaae3f381338b2fb Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:16:39 +0100 Subject: [PATCH 06/27] Update arabic_evals.py Add support to AlGhafa Translated benchmark suite (11 subsets) --- community_tasks/arabic_evals.py | 280 +++++++++++++++++++++++++++++++- 1 file changed, 277 insertions(+), 3 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 2fc1a2807..72667d26d 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -29,6 +29,7 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES +import re # fmt: off @@ -199,12 +200,12 @@ def arabic_exams(line, task_name: str = None): ) -## ALGHAFA ## +## ALGHAFA NATIVE ## # fmt: off ALGHAFA_SUBSETS = [ "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", - "multiple_choice_sentiment_task", # "multiple_choice_openbookqa_translated_task", "multiple_choice_copa_translated_task" ### TODO : clean up this later ! 
+ "multiple_choice_sentiment_task" ] # fmt: on @@ -260,7 +261,280 @@ def Alghafa(line, task_name: str = None): ) -_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] +## ALGHAFA TRANSLATED ## +# race_ar ## +race_ar_task = LightevalTaskConfig( + name="race_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="race_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# piqa_ar ## +piqa_ar_task = LightevalTaskConfig( + name="piqa_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="piqa_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# arc_easy_ar ## +arc_easy_ar_task = LightevalTaskConfig( + name="arc_easy_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="arc_easy_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# arc_challenge_okapi_ar ## +arc_challenge_okapi_ar_task = LightevalTaskConfig( + name="arc_challenge_okapi_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="arc_challenge_okapi_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # 
metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# mmlu_okapi_ar ## +mmlu_okapi_ar_task = LightevalTaskConfig( + name="mmlu_okapi_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="mmlu_okapi_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# openbook_qa_ext_ar ## +openbook_qa_ext_ar_task = LightevalTaskConfig( + name="openbook_qa_ext_ar", + prompt_function="Alghafa", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="openbook_qa_ext_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + + +# boolq_ar ## +boolq_ar_task = LightevalTaskConfig( + name="boolq_ar", + prompt_function="boolq_function", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="boolq_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + +def boolq_function(line, task_name: str = None): + question = line["question"] + passage = line["passage"] + answer = "نعم" if line["answer"] else "لا" + + query = "بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ \"نعم\" أو \"لا\":\n{}\nالإجابة:".format(passage, question) + + return Doc( + task_name=task_name, + query=query, + choices=["نعم", "لا"], + gold_index=0 if line["answer"] else 1, + instruction="", + target_for_fewshot_sorting=answer, + ) + + +# copa_ext_ar ## +copa_ext_ar_task = LightevalTaskConfig( + name="copa_ext_ar", + 
prompt_function="copa_function", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="copa_ext_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + +def copa_function(line, task_name: str = None): + premise = line["premise"] + choices = [line["choice1"], line["choice2"]] + question_map = {"cause": "لأن", "effect": "لذلك"} + question = question_map[line["question"]] + answer = line["label"] + + query = "{}، {} :\n0) {}\n1) {}\nالإجابة:".format(premise, question, choices[0], choices[1]) + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer, + instruction="", + target_for_fewshot_sorting=choices[answer], + ) + + +# hellaswag_okapi_ar ## +hellaswag_okapi_ar_task = LightevalTaskConfig( + name="hellaswag_okapi_ar", + prompt_function="hellaswag_function", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="hellaswag_okapi_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + +def hellaswag_function(line, task_name: str = None): + ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets + endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])] # endings is a string representation of a list + answer_index = line["label"] + + query = "بناءً على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx) + for i, ending in enumerate(endings): + query += "{}) {}\n".format(i, ending) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=endings, + gold_index=answer_index, + instruction="", + 
target_for_fewshot_sorting=endings[answer_index], + ) + + +toxigen_ar_task = LightevalTaskConfig( + name="toxigen_ar", + prompt_function="toxigen_function", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="toxigen_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + +def toxigen_function(line, task_name: str = None): + text = line["text"] + label = 1 if ((line['toxicity_ai'] + line['toxicity_human']) > 5.5) else 0 + query = f"هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\".\n'{text}'\nالإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=["لا", "نعم"], + gold_index=label, + instruction="", + target_for_fewshot_sorting="نعم" if label == 1 else "لا", + ) + + +sciq_ar_task = LightevalTaskConfig( + name="sciq_ar", + prompt_function="sciq_function", + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="sciq_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + # metric=["loglikelihood_acc"], + trust_dataset=True, +) + +def sciq_function(line, task_name: str = None): + support = line["support"] + question = line["question"] + choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] + answer_index = 3 # The label is always 3 for the correct answer + + query = "بناءً على السياق التالي:\n{}\n اختر الإجابة الصحيحة من الاقتراحات التالية:\n".format(support) + for i, choice in enumerate(choices): + query += "{}) {}\n".format(i, choice) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer_index, + instruction="", + 
target_for_fewshot_sorting=choices[answer_index], + ) + + +_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] + [arc_easy_ar_task] + [arc_challenge_okapi_ar_task] + [mmlu_okapi_ar_task] + [openbook_qa_ext_ar_task] + [boolq_ar_task] + [copa_ext_ar_task] + [hellaswag_okapi_ar_task] + [toxigen_ar_task] + [sciq_ar_task] # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] From 9bb4da0ed59cffbc81293993c4c5f9f54eb8ff93 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:57:15 +0100 Subject: [PATCH 07/27] Update arabic_evals.py minor fixes flagged by the pre-commit hook --- community_tasks/arabic_evals.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 72667d26d..5c9f636ea 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -200,7 +200,7 @@ def arabic_exams(line, task_name: str = None): ) -## ALGHAFA NATIVE ## +# ALGHAFA NATIVE ## # fmt: off ALGHAFA_SUBSETS = [ "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", @@ -261,8 +261,8 @@ def Alghafa(line, task_name: str = None): ) -## ALGHAFA TRANSLATED ## -# race_ar ## +# ALGHAFA TRANSLATED ## +# race_ar race_ar_task = LightevalTaskConfig( name="race_ar", prompt_function="Alghafa", @@ -279,7 +279,7 @@ def Alghafa(line, task_name: str = None): ) -# piqa_ar ## +# piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", prompt_function="Alghafa", @@ -296,7 +296,7 @@ def Alghafa(line, task_name: str = None): ) -# arc_easy_ar ## +# arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", prompt_function="Alghafa", @@ -313,7 +313,7 @@ def Alghafa(line, task_name: str = None): ) -# arc_challenge_okapi_ar ## +# 
arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", prompt_function="Alghafa", @@ -330,7 +330,7 @@ def Alghafa(line, task_name: str = None): ) -# mmlu_okapi_ar ## +# mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", prompt_function="Alghafa", @@ -347,7 +347,7 @@ def Alghafa(line, task_name: str = None): ) -# openbook_qa_ext_ar ## +# openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", prompt_function="Alghafa", @@ -364,7 +364,7 @@ def Alghafa(line, task_name: str = None): ) -# boolq_ar ## +# boolq_ar boolq_ar_task = LightevalTaskConfig( name="boolq_ar", prompt_function="boolq_function", @@ -397,7 +397,7 @@ def boolq_function(line, task_name: str = None): ) -# copa_ext_ar ## +# copa_ext_ar copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", prompt_function="copa_function", @@ -432,7 +432,7 @@ def copa_function(line, task_name: str = None): ) -# hellaswag_okapi_ar ## +# hellaswag_okapi_ar hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", prompt_function="hellaswag_function", @@ -468,6 +468,7 @@ def hellaswag_function(line, task_name: str = None): ) +# toxigen_ar toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", prompt_function="toxigen_function", @@ -498,6 +499,7 @@ def toxigen_function(line, task_name: str = None): ) +# sciq_ar sciq_ar_task = LightevalTaskConfig( name="sciq_ar", prompt_function="sciq_function", @@ -519,10 +521,10 @@ def sciq_function(line, task_name: str = None): choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] answer_index = 3 # The label is always 3 for the correct answer - query = "بناءً على السياق التالي:\n{}\n اختر الإجابة الصحيحة من الاقتراحات التالية:\n".format(support) + query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(support, question) 
for i, choice in enumerate(choices): - query += "{}) {}\n".format(i, choice) - query += "الإجابة:" + query += "\n{}) {}".format(i, choice) + query += "\nالإجابة:" return Doc( task_name=task_name, From 3298ab5cc98529a0510fbd68b24d0302fbbf576f Mon Sep 17 00:00:00 2001 From: alielfilali01 Date: Fri, 8 Mar 2024 15:59:04 +0000 Subject: [PATCH 08/27] fix checks --- auto_commit_fixes.sh | 14 ++++++++++++++ tests/__init__.py | 1 - 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh new file mode 100755 index 000000000..dacc45864 --- /dev/null +++ b/auto_commit_fixes.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Run pre-commit on all files +pre-commit run --all-files + +# Check if there are changes that need to be staged and committed +if ! git diff --quiet; then + echo "Fixing inconsistencies and committing..." + git add . + git commit -m "fix checks" + git push origin main +else + echo "No changes detected." +fi diff --git a/tests/__init__.py b/tests/__init__.py index 04980c23f..a732db8d0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -19,4 +19,3 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
- From ad1ee556c587d9c013b77469eda4708c726a7b7a Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:13:47 +0100 Subject: [PATCH 09/27] Update OALL_tasks.txt forgot to remove `community|Alghafa:multiple_choice_copa_translated_task|5|1` & `community|Alghafa:multiple_choice_openbookqa_translated_task|5|1` from ALGHAFA NATIVE --- tasks_examples/OALL_tasks.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt index 549039e18..e86fbae1c 100644 --- a/tasks_examples/OALL_tasks.txt +++ b/tasks_examples/OALL_tasks.txt @@ -118,11 +118,9 @@ community|acva:entertainment|5|1 community|Alghafa:mcq_exams_test_ar|5|1 community|Alghafa:meta_ar_dialects|5|1 community|Alghafa:meta_ar_msa|5|1 -community|Alghafa:multiple_choice_copa_translated_task|5|1 community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|Alghafa:multiple_choice_openbookqa_translated_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_task|5|1 community|Alghafa:multiple_choice_sentiment_task|5|1 From a9a25377a01c56b10942e9f077062683ff0a18ed Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:14:12 +0100 Subject: [PATCH 10/27] Update all_arabic_tasks.txt forgot to remove `community|Alghafa:multiple_choice_copa_translated_task|5|1` & `community|Alghafa:multiple_choice_openbookqa_translated_task|5|1` from ALGHAFA NATIVE --- tasks_examples/all_arabic_tasks.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt index 578f934a5..2856a34a4 100644 --- a/tasks_examples/all_arabic_tasks.txt +++ b/tasks_examples/all_arabic_tasks.txt 
@@ -118,11 +118,9 @@ community|acva:entertainment|5|1 community|Alghafa:mcq_exams_test_ar|5|1 community|Alghafa:meta_ar_dialects|5|1 community|Alghafa:meta_ar_msa|5|1 -community|Alghafa:multiple_choice_copa_translated_task|5|1 community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|Alghafa:multiple_choice_openbookqa_translated_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 community|Alghafa:multiple_choice_rating_sentiment_task|5|1 community|Alghafa:multiple_choice_sentiment_task|5|1 From 55e27bbbdca38696e5647d5d970fe3aa7c4389d8 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:37:25 +0100 Subject: [PATCH 11/27] Delete auto_commit_fixes.sh no need --- auto_commit_fixes.sh | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh deleted file mode 100755 index dacc45864..000000000 --- a/auto_commit_fixes.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# Run pre-commit on all files -pre-commit run --all-files - -# Check if there are changes that need to be staged and committed -if ! git diff --quiet; then - echo "Fixing inconsistencies and committing..." - git add . - git commit -m "fix checks" - git push origin main -else - echo "No changes detected." 
-fi From fea2cec365138782edab5eb30a06b3f387405fdc Mon Sep 17 00:00:00 2001 From: alielfilali01 Date: Fri, 8 Mar 2024 17:43:05 +0000 Subject: [PATCH 12/27] fix checks --- auto_commit_fixes.sh | 17 +++++++++ community_tasks/arabic_evals.py | 66 +++++++++++++++++++++++---------- 2 files changed, 63 insertions(+), 20 deletions(-) create mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh new file mode 100755 index 000000000..c4f93e9ba --- /dev/null +++ b/auto_commit_fixes.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Run pre-commit on all files +pre-commit run --all-files + +# Run make style as suggested by Clémentine +make style + +# Check if there are changes that need to be staged and committed +if ! git diff --quiet; then + echo "Fixing inconsistencies and committing..." + git add . + git commit -m "fix checks" + git push origin main +else + echo "No changes detected." +fi diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 5c9f636ea..2e2538dfc 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -26,10 +26,11 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. 
""" +import re + from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES -import re # fmt: off @@ -203,8 +204,8 @@ def arabic_exams(line, task_name: str = None): # ALGHAFA NATIVE ## # fmt: off ALGHAFA_SUBSETS = [ - "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", - "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", + "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", + "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", "multiple_choice_sentiment_task" ] # fmt: on @@ -262,7 +263,7 @@ def Alghafa(line, task_name: str = None): # ALGHAFA TRANSLATED ## -# race_ar +# race_ar race_ar_task = LightevalTaskConfig( name="race_ar", prompt_function="Alghafa", @@ -279,7 +280,7 @@ def Alghafa(line, task_name: str = None): ) -# piqa_ar +# piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", prompt_function="Alghafa", @@ -296,7 +297,7 @@ def Alghafa(line, task_name: str = None): ) -# arc_easy_ar +# arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", prompt_function="Alghafa", @@ -330,7 +331,7 @@ def Alghafa(line, task_name: str = None): ) -# mmlu_okapi_ar +# mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", prompt_function="Alghafa", @@ -347,7 +348,7 @@ def Alghafa(line, task_name: str = None): ) -# openbook_qa_ext_ar +# openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", prompt_function="Alghafa", @@ -364,7 +365,7 @@ def Alghafa(line, task_name: str = None): ) -# boolq_ar +# 
boolq_ar boolq_ar_task = LightevalTaskConfig( name="boolq_ar", prompt_function="boolq_function", @@ -380,13 +381,14 @@ def Alghafa(line, task_name: str = None): trust_dataset=True, ) + def boolq_function(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" - query = "بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ \"نعم\" أو \"لا\":\n{}\nالإجابة:".format(passage, question) - + query = 'بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ "نعم" أو "لا":\n{}\nالإجابة:'.format(passage, question) + return Doc( task_name=task_name, query=query, @@ -397,7 +399,7 @@ def boolq_function(line, task_name: str = None): ) -# copa_ext_ar +# copa_ext_ar copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", prompt_function="copa_function", @@ -413,6 +415,7 @@ def boolq_function(line, task_name: str = None): trust_dataset=True, ) + def copa_function(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] @@ -421,7 +424,7 @@ def copa_function(line, task_name: str = None): answer = line["label"] query = "{}، {} :\n0) {}\n1) {}\nالإجابة:".format(premise, question, choices[0], choices[1]) - + return Doc( task_name=task_name, query=query, @@ -432,7 +435,7 @@ def copa_function(line, task_name: str = None): ) -# hellaswag_okapi_ar +# hellaswag_okapi_ar hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", prompt_function="hellaswag_function", @@ -448,9 +451,12 @@ def copa_function(line, task_name: str = None): trust_dataset=True, ) + def hellaswag_function(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets - endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])] # endings is a string representation of a list + endings = [ + re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) + ] # endings is a string representation of a list answer_index = line["label"] query = "بناءً 
على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx) @@ -468,7 +474,7 @@ def hellaswag_function(line, task_name: str = None): ) -# toxigen_ar +# toxigen_ar toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", prompt_function="toxigen_function", @@ -484,10 +490,11 @@ def hellaswag_function(line, task_name: str = None): trust_dataset=True, ) + def toxigen_function(line, task_name: str = None): text = line["text"] - label = 1 if ((line['toxicity_ai'] + line['toxicity_human']) > 5.5) else 0 - query = f"هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\".\n'{text}'\nالإجابة:" + label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 + query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:' return Doc( task_name=task_name, @@ -515,13 +522,16 @@ def toxigen_function(line, task_name: str = None): trust_dataset=True, ) + def sciq_function(line, task_name: str = None): support = line["support"] question = line["question"] choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] answer_index = 3 # The label is always 3 for the correct answer - query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format(support, question) + query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format( + support, question + ) for i, choice in enumerate(choices): query += "\n{}) {}".format(i, choice) query += "\nالإجابة:" @@ -536,7 +546,23 @@ def sciq_function(line, task_name: str = None): ) -_TASKS = ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] + [arc_easy_ar_task] + [arc_challenge_okapi_ar_task] + [mmlu_okapi_ar_task] + [openbook_qa_ext_ar_task] + [boolq_ar_task] + [copa_ext_ar_task] + [hellaswag_okapi_ar_task] + 
[toxigen_ar_task] + [sciq_ar_task] +_TASKS = ( + ARABIC_MMLU_TASKS + + ACVA_TASKS + + ALGHAFA_TASKS + + [arabic_exams_task] + + [race_ar_task] + + [piqa_ar_task] + + [arc_easy_ar_task] + + [arc_challenge_okapi_ar_task] + + [mmlu_okapi_ar_task] + + [openbook_qa_ext_ar_task] + + [boolq_ar_task] + + [copa_ext_ar_task] + + [hellaswag_okapi_ar_task] + + [toxigen_ar_task] + + [sciq_ar_task] +) # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] From d6646f90f7ca7f8ba2fcabe32fa37e2c7414f066 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:00:20 +0000 Subject: [PATCH 13/27] Delete auto_commit_fixes.sh --- auto_commit_fixes.sh | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh deleted file mode 100755 index c4f93e9ba..000000000 --- a/auto_commit_fixes.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Run pre-commit on all files -pre-commit run --all-files - -# Run make style as suggested by Clémentine -make style - -# Check if there are changes that need to be staged and committed -if ! git diff --quiet; then - echo "Fixing inconsistencies and committing..." - git add . - git commit -m "fix checks" - git push origin main -else - echo "No changes detected." -fi From 306a5f3528fc2c6d587b1d05635e61b67ba8dcea Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:49:08 +0000 Subject: [PATCH 14/27] Update arabic_evals.py homogenize naming according to the following comments: #### Prompt names such as boolq_function will be unclear long term. For such functions, you could either use boolq_prompt_arabic or just boolq_arabic. (You need to specify the language since there is already a boolq prompt function by default.) 
You also need to homogenize Alghafa, which exists with several different casings, and fit it to Python style casing. For the prompt function, I'd keep it as alghafa_prompt or alghafa, for the class, CustomAlGhafaTask, and here for the name I'd keep it lower case [CustomAlGhafaTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] #### --- community_tasks/arabic_evals.py | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 2e2538dfc..710477be9 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -220,7 +220,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function="Alghafa", + prompt_function="alghafa_prompt", hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=["loglikelihood_acc_norm"], # metric=["loglikelihood_acc"], @@ -236,10 +236,10 @@ def __init__( ) -ALGHAFA_TASKS = [CustomALGHAFATask(name=f"Alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] +ALGHAFA_TASKS = [CustomALGHAFATask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] -def Alghafa(line, task_name: str = None): +def alghafa_prompt(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' @@ -266,7 +266,7 @@ def Alghafa(line, task_name: str = None): # race_ar race_ar_task = LightevalTaskConfig( name="race_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", @@ -283,7 +283,7 @@ def Alghafa(line, task_name: str = None): # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", 
hf_subset="piqa_ar", @@ -300,7 +300,7 @@ def Alghafa(line, task_name: str = None): # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", @@ -317,7 +317,7 @@ def Alghafa(line, task_name: str = None): # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", @@ -334,7 +334,7 @@ def Alghafa(line, task_name: str = None): # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", @@ -351,7 +351,7 @@ def Alghafa(line, task_name: str = None): # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", - prompt_function="Alghafa", + prompt_function="alghafa_prompt", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", @@ -368,7 +368,7 @@ def Alghafa(line, task_name: str = None): # boolq_ar boolq_ar_task = LightevalTaskConfig( name="boolq_ar", - prompt_function="boolq_function", + prompt_function="boolq_prompt_arabic", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="boolq_ar", @@ -382,7 +382,7 @@ def Alghafa(line, task_name: str = None): ) -def boolq_function(line, task_name: str = None): +def boolq_prompt_arabic(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" @@ -402,7 +402,7 @@ def boolq_function(line, task_name: str = None): # copa_ext_ar copa_ext_ar_task = 
LightevalTaskConfig( name="copa_ext_ar", - prompt_function="copa_function", + prompt_function="copa_prompt_arabic", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="copa_ext_ar", @@ -416,7 +416,7 @@ def boolq_function(line, task_name: str = None): ) -def copa_function(line, task_name: str = None): +def copa_prompt_arabic(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] question_map = {"cause": "لأن", "effect": "لذلك"} @@ -438,7 +438,7 @@ def copa_function(line, task_name: str = None): # hellaswag_okapi_ar hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", - prompt_function="hellaswag_function", + prompt_function="hellaswag_prompt_arabic", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="hellaswag_okapi_ar", @@ -452,7 +452,7 @@ def copa_function(line, task_name: str = None): ) -def hellaswag_function(line, task_name: str = None): +def hellaswag_prompt_arabic(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) @@ -477,7 +477,7 @@ def hellaswag_function(line, task_name: str = None): # toxigen_ar toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", - prompt_function="toxigen_function", + prompt_function="toxigen_prompt_arabic", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="toxigen_ar", @@ -491,7 +491,7 @@ def hellaswag_function(line, task_name: str = None): ) -def toxigen_function(line, task_name: str = None): +def toxigen_prompt_arabic(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:' @@ -509,7 +509,7 @@ def toxigen_function(line, task_name: str = None): # sciq_ar sciq_ar_task = 
LightevalTaskConfig( name="sciq_ar", - prompt_function="sciq_function", + prompt_function="sciq_prompt_arabic", suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="sciq_ar", @@ -523,7 +523,7 @@ def toxigen_function(line, task_name: str = None): ) -def sciq_function(line, task_name: str = None): +def sciq_prompt_arabic(line, task_name: str = None): support = line["support"] question = line["question"] choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] From 2fbad52dcca7104164e9c16e05572245e11da070 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:49:54 +0000 Subject: [PATCH 15/27] Update OALL_tasks.txt homogenize AlGhafa naming: `Alghafa` to `alghafa` --- tasks_examples/OALL_tasks.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt index e86fbae1c..346d062c6 100644 --- a/tasks_examples/OALL_tasks.txt +++ b/tasks_examples/OALL_tasks.txt @@ -115,15 +115,15 @@ community|acva:communication|5|1 community|acva:computer_and_phone|5|1 community|acva:daily_life|5|1 community|acva:entertainment|5|1 -community|Alghafa:mcq_exams_test_ar|5|1 -community|Alghafa:meta_ar_dialects|5|1 -community|Alghafa:meta_ar_msa|5|1 -community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|Alghafa:multiple_choice_rating_sentiment_task|5|1 -community|Alghafa:multiple_choice_sentiment_task|5|1 +community|alghafa:mcq_exams_test_ar|5|1 +community|alghafa:meta_ar_dialects|5|1 +community|alghafa:meta_ar_msa|5|1 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 
+community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 +community|alghafa:multiple_choice_rating_sentiment_task|5|1 +community|alghafa:multiple_choice_sentiment_task|5|1 community|race_ar|5|1 community|piqa_ar|5|1 community|arc_easy_ar|5|1 From f3724035399e8e6bbe53f12f72c8ebe93cc95079 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:50:08 +0000 Subject: [PATCH 16/27] Update all_arabic_tasks.txt homogenize AlGhafa naming: `Alghafa` to `alghafa` --- tasks_examples/all_arabic_tasks.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tasks_examples/all_arabic_tasks.txt b/tasks_examples/all_arabic_tasks.txt index 2856a34a4..fa430ed14 100644 --- a/tasks_examples/all_arabic_tasks.txt +++ b/tasks_examples/all_arabic_tasks.txt @@ -115,15 +115,15 @@ community|acva:communication|5|1 community|acva:computer_and_phone|5|1 community|acva:daily_life|5|1 community|acva:entertainment|5|1 -community|Alghafa:mcq_exams_test_ar|5|1 -community|Alghafa:meta_ar_dialects|5|1 -community|Alghafa:meta_ar_msa|5|1 -community|Alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|Alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|Alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|Alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|Alghafa:multiple_choice_rating_sentiment_task|5|1 -community|Alghafa:multiple_choice_sentiment_task|5|1 +community|alghafa:mcq_exams_test_ar|5|1 +community|alghafa:meta_ar_dialects|5|1 +community|alghafa:meta_ar_msa|5|1 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 +community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 
+community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 +community|alghafa:multiple_choice_rating_sentiment_task|5|1 +community|alghafa:multiple_choice_sentiment_task|5|1 community|race_ar|5|1 community|piqa_ar|5|1 community|arc_easy_ar|5|1 From 2c492f6d6ebc42d9013f7620650783d99cc49cc4 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:57:37 +0000 Subject: [PATCH 17/27] Update community_tasks/arabic_evals.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/arabic_evals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 710477be9..1c2a25039 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -223,7 +223,6 @@ def __init__( prompt_function="alghafa_prompt", hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", From e7ddfb547319f02aae80232d504fc5eeb2a99b4b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:57:44 +0000 Subject: [PATCH 18/27] Update community_tasks/arabic_evals.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/arabic_evals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 1c2a25039..664a622f7 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -173,7 +173,6 @@ def acva(line, task_name: str = None): few_shots_split="validation", 
few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) From f7278c19729245eed17f3f00acdbebc377e93448 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:57:55 +0000 Subject: [PATCH 19/27] Update community_tasks/arabic_evals.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/arabic_evals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 664a622f7..f330d47b5 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -64,7 +64,6 @@ def __init__( prompt_function="mmlu_arabic", hf_repo="OALL/Arabic_MMLU", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split="dev", From f186ded6b94c980cea08e28aa27666b31b7b0404 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:58:03 +0000 Subject: [PATCH 20/27] Update community_tasks/arabic_evals.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/arabic_evals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index f330d47b5..d8af5db6c 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -131,7 +131,6 @@ def __init__( prompt_function="acva", hf_repo="OALL/ACVA", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", From 6be58045286ae3c447c74b38442a271ebbb51acc 
Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:22:30 +0000 Subject: [PATCH 21/27] Update community_tasks/arabic_evals.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use the standard camel casing for classes: (remove) class CustomALGHAFATask(LightevalTaskConfig): (add) class CustomAlGhafaTask(LightevalTaskConfig): Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/arabic_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index d8af5db6c..3bc61fbf7 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -208,7 +208,7 @@ def arabic_exams(line, task_name: str = None): # fmt: on -class CustomALGHAFATask(LightevalTaskConfig): +class CustomAlGhafaTask(LightevalTaskConfig): def __init__( self, name, From e65e026785dd410c47850f507cbbe179cfaf1d4b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 14 Mar 2024 14:05:46 +0000 Subject: [PATCH 22/27] Update arabic_evals.py Fixes based on Clementine's comments --- community_tasks/arabic_evals.py | 95 +++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 34 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 3bc61fbf7..e456e2b45 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -27,6 +27,7 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. 
""" import re +import random from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -208,7 +209,7 @@ def arabic_exams(line, task_name: str = None): # fmt: on -class CustomAlGhafaTask(LightevalTaskConfig): +class CustomAlGhafaNativeTask(LightevalTaskConfig): def __init__( self, name, @@ -232,7 +233,7 @@ def __init__( ) -ALGHAFA_TASKS = [CustomALGHAFATask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] +ALGHAFA_TASKS = [CustomAlGhafaNativeTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] def alghafa_prompt(line, task_name: str = None): @@ -271,7 +272,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -288,7 +288,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -305,7 +304,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -322,7 +320,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -339,7 +336,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -356,7 +352,6 @@ def alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -373,7 +368,6 @@ def 
alghafa_prompt(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -382,15 +376,22 @@ def boolq_prompt_arabic(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" - - query = 'بناءً على المقطع التالي:\n{}\n أجب عن هذا السؤال بـ "نعم" أو "لا":\n{}\nالإجابة:'.format(passage, question) + instruction = "بناء على المقطع التالي، أجب عن السؤال ب \"نعم\" أو \"لا\"" + query = f""" + {instruction} + المقطع : + {passage} + السؤال: + {question} + الإجابة: + """ return Doc( task_name=task_name, query=query, choices=["نعم", "لا"], gold_index=0 if line["answer"] else 1, - instruction="", + instruction=instruction, target_for_fewshot_sorting=answer, ) @@ -407,7 +408,6 @@ def boolq_prompt_arabic(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -443,21 +443,25 @@ def copa_prompt_arabic(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) def hellaswag_prompt_arabic(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets - endings = [ - re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) - ] # endings is a string representation of a list + endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])] # endings is a string representation of a list answer_index = line["label"] - - query = "بناءً على السياق التالي:\n{}\n اختر النهاية الصحيحة من الاقتراحات التالية:\n".format(ctx) + instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية" + + query = f""" + {instruction} + السياق: + {ctx} + الاقتراحات: + + """ for i, ending in 
enumerate(endings): - query += "{}) {}\n".format(i, ending) + query += f"{i}) {ending}\n" query += "الإجابة:" return Doc( @@ -465,7 +469,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): query=query, choices=endings, gold_index=answer_index, - instruction="", + instruction=instruction, target_for_fewshot_sorting=endings[answer_index], ) @@ -482,7 +486,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -490,14 +493,21 @@ def hellaswag_prompt_arabic(line, task_name: str = None): def toxigen_prompt_arabic(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 - query = f'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".\n\'{text}\'\nالإجابة:' + instruction = "هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\"." + + query = f""" + {instruction} + العبارة: + '{text}' + الإجابة: + """ return Doc( task_name=task_name, query=query, choices=["لا", "نعم"], gold_index=label, - instruction="", + instruction=instruction, target_for_fewshot_sorting="نعم" if label == 1 else "لا", ) @@ -514,7 +524,6 @@ def toxigen_prompt_arabic(line, task_name: str = None): few_shots_split="validation", few_shots_select="sequential", metric=["loglikelihood_acc_norm"], - # metric=["loglikelihood_acc"], trust_dataset=True, ) @@ -522,22 +531,40 @@ def toxigen_prompt_arabic(line, task_name: str = None): def sciq_prompt_arabic(line, task_name: str = None): support = line["support"] question = line["question"] - choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] - answer_index = 3 # The label is always 3 for the correct answer - - query = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال أدناه من قائمة الاقتراحات:\n\nالسياق:\n{}\n\nالسؤال:{}\n\nالإجابات المحتملة:".format( - 
support, question - ) + correct_answer = line["correct_answer"] + choices = [ + line["distractor1"], + line["distractor2"], + line["distractor3"], + correct_answer + ] + + # Shuffle the choices + random.shuffle(choices) + + answer_index = choices.index(correct_answer) + + instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات" + + query = f""" + {instruction} + السياق: + {support} + السؤال: + {question} + الإجابات المحتملة: + + """ for i, choice in enumerate(choices): - query += "\n{}) {}".format(i, choice) - query += "\nالإجابة:" + query += f"{i}) {choice}\n" + query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=choices, gold_index=answer_index, - instruction="", + instruction=instruction, target_for_fewshot_sorting=choices[answer_index], ) From c0e1a3f38d3a62614aaf1b2949eab12f188dde5f Mon Sep 17 00:00:00 2001 From: alielfilali01 Date: Thu, 14 Mar 2024 14:14:49 +0000 Subject: [PATCH 23/27] fix checks --- auto_commit_fixes.sh | 17 +++++++++++++++++ community_tasks/arabic_evals.py | 27 ++++++++++++--------------- 2 files changed, 29 insertions(+), 15 deletions(-) create mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh new file mode 100755 index 000000000..c4f93e9ba --- /dev/null +++ b/auto_commit_fixes.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Run pre-commit on all files +pre-commit run --all-files + +# Run make style as suggested by Clémentine +make style + +# Check if there are changes that need to be staged and committed +if ! git diff --quiet; then + echo "Fixing inconsistencies and committing..." + git add . + git commit -m "fix checks" + git push origin main +else + echo "No changes detected." 
+fi diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index e456e2b45..d420476de 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -26,8 +26,8 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. """ -import re import random +import re from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -376,7 +376,7 @@ def boolq_prompt_arabic(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" - instruction = "بناء على المقطع التالي، أجب عن السؤال ب \"نعم\" أو \"لا\"" + instruction = 'بناء على المقطع التالي، أجب عن السؤال ب "نعم" أو "لا"' query = f""" {instruction} المقطع : @@ -449,7 +449,9 @@ def copa_prompt_arabic(line, task_name: str = None): def hellaswag_prompt_arabic(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets - endings = [re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"])] # endings is a string representation of a list + endings = [ + re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) + ] # endings is a string representation of a list answer_index = line["label"] instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية" @@ -458,7 +460,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): السياق: {ctx} الاقتراحات: - + """ for i, ending in enumerate(endings): query += f"{i}) {ending}\n" @@ -493,7 +495,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): def toxigen_prompt_arabic(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 - instruction = "هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ\"نعم\" أو \"لا\"." + instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' 
query = f""" {instruction} @@ -532,18 +534,13 @@ def sciq_prompt_arabic(line, task_name: str = None): support = line["support"] question = line["question"] correct_answer = line["correct_answer"] - choices = [ - line["distractor1"], - line["distractor2"], - line["distractor3"], - correct_answer - ] - + choices = [line["distractor1"], line["distractor2"], line["distractor3"], correct_answer] + # Shuffle the choices random.shuffle(choices) - + answer_index = choices.index(correct_answer) - + instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات" query = f""" @@ -553,7 +550,7 @@ def sciq_prompt_arabic(line, task_name: str = None): السؤال: {question} الإجابات المحتملة: - + """ for i, choice in enumerate(choices): query += f"{i}) {choice}\n" From 294ac498b250242b5dae9e0b8b5e2b351e6efe1b Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Thu, 14 Mar 2024 14:18:37 +0000 Subject: [PATCH 24/27] Delete auto_commit_fixes.sh --- auto_commit_fixes.sh | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh deleted file mode 100755 index c4f93e9ba..000000000 --- a/auto_commit_fixes.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Run pre-commit on all files -pre-commit run --all-files - -# Run make style as suggested by Clémentine -make style - -# Check if there are changes that need to be staged and committed -if ! git diff --quiet; then - echo "Fixing inconsistencies and committing..." - git add . - git commit -m "fix checks" - git push origin main -else - echo "No changes detected." 
-fi From 97bd39338335dd40efb633e405628e271643280d Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:15:42 +0000 Subject: [PATCH 25/27] Update arabic_evals.py Fix ValueError: Prompt query --- community_tasks/arabic_evals.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index d420476de..4fe59c2d6 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -376,9 +376,8 @@ def boolq_prompt_arabic(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" - instruction = 'بناء على المقطع التالي، أجب عن السؤال ب "نعم" أو "لا"' - query = f""" - {instruction} + instruction = 'بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا' + query = f"""{instruction} المقطع : {passage} السؤال: @@ -455,8 +454,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): answer_index = line["label"] instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية" - query = f""" - {instruction} + query = f"""{instruction} السياق: {ctx} الاقتراحات: @@ -497,8 +495,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' 
- query = f""" - {instruction} + query = f"""{instruction} العبارة: '{text}' الإجابة: @@ -543,8 +540,7 @@ def sciq_prompt_arabic(line, task_name: str = None): instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات" - query = f""" - {instruction} + query = f"""{instruction} السياق: {support} السؤال: From e8cf58dce3a907605937d9513f88fe33b310cd86 Mon Sep 17 00:00:00 2001 From: alielfilali01 Date: Tue, 26 Mar 2024 17:01:19 +0000 Subject: [PATCH 26/27] fix checks --- auto_commit_fixes.sh | 17 +++++++++++++++++ community_tasks/arabic_evals.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh new file mode 100755 index 000000000..c4f93e9ba --- /dev/null +++ b/auto_commit_fixes.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Run pre-commit on all files +pre-commit run --all-files + +# Run make style as suggested by Clémentine +make style + +# Check if there are changes that need to be staged and committed +if ! git diff --quiet; then + echo "Fixing inconsistencies and committing..." + git add . + git commit -m "fix checks" + git push origin main +else + echo "No changes detected." 
+fi diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 4fe59c2d6..60db04505 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -376,7 +376,7 @@ def boolq_prompt_arabic(line, task_name: str = None): question = line["question"] passage = line["passage"] answer = "نعم" if line["answer"] else "لا" - instruction = 'بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا' + instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا" query = f"""{instruction} المقطع : {passage} From be7f3b834c788273ac1ee23fd47349f3e8dfbe30 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:02:53 +0000 Subject: [PATCH 27/27] Delete auto_commit_fixes.sh --- auto_commit_fixes.sh | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100755 auto_commit_fixes.sh diff --git a/auto_commit_fixes.sh b/auto_commit_fixes.sh deleted file mode 100755 index c4f93e9ba..000000000 --- a/auto_commit_fixes.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Run pre-commit on all files -pre-commit run --all-files - -# Run make style as suggested by Clémentine -make style - -# Check if there are changes that need to be staged and committed -if ! git diff --quiet; then - echo "Fixing inconsistencies and committing..." - git add . - git commit -m "fix checks" - git push origin main -else - echo "No changes detected." -fi