diff --git a/lm_eval/tasks/agieval/README.md b/lm_eval/tasks/agieval/README.md
new file mode 100644
index 0000000000..faaf47b6be
--- /dev/null
+++ b/lm_eval/tasks/agieval/README.md
@@ -0,0 +1,114 @@
+# AGIEval
+
+### Paper
+
+Title: AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models
+
+Abstract: https://arxiv.org/abs/2304.06364
+
+AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models on tasks pertinent to human cognition and problem-solving.
+The benchmark is derived from 20 official, public, and high-standard admission and qualification exams intended for general human test-takers, such as general college admission tests (e.g., the Chinese College Entrance Exam (Gaokao) and the American SAT), law school admission tests, math competitions, lawyer qualification tests, and national civil service exams.
+
+Homepage: https://github.com/ruixiangcui/AGIEval
+
+### Citation
+
+```
+@misc{zhong2023agieval,
+    title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
+    author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
+    year={2023},
+    eprint={2304.06364},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+Please make sure to also cite the individual source datasets in your paper when you use them. The relevant citation information is listed below:
+
+```
+@inproceedings{ling-etal-2017-program,
+    title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
+    author = "Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil",
+    booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+    month = jul,
+    year = "2017",
+    address = "Vancouver, Canada",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/P17-1015",
+    doi = "10.18653/v1/P17-1015",
+    pages = "158--167"
+}
+
+@article{hendrycksmath2021,
+    title={Measuring Mathematical Problem Solving With the MATH Dataset},
+    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+    journal={NeurIPS},
+    year={2021}
+}
+
+@inproceedings{Liu2020LogiQAAC,
+    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
+    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
+    booktitle={International Joint Conference on Artificial Intelligence},
+    year={2020}
+}
+
+@inproceedings{zhong2019jec,
+    title={JEC-QA: A Legal-Domain Question Answering Dataset},
+    author={Zhong, Haoxi and Xiao, Chaojun and Tu, Cunchao and Zhang, Tianyang and Liu, Zhiyuan and Sun, Maosong},
+    booktitle={Proceedings of AAAI},
+    year={2020}
+}
+
+@article{Wang2021FromLT,
+    title={From LSAT: The Progress and Challenges of Complex Reasoning},
+    author={Siyuan Wang and Zhongkun Liu and Wanjun Zhong and Ming Zhou and Zhongyu Wei and Zhumin Chen and Nan Duan},
+    journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+    year={2021},
+    volume={30},
+    pages={2201--2216}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+- `agieval`: Evaluates all tasks listed below.
+
+- `agieval_en`: Evaluates all English subtasks: `agieval_aqua_rat`, `agieval_gaokao_english`, `agieval_logiqa_en`, `agieval_lsat_*`, `agieval_sat_*`, and `agieval_math`.
+
+- `agieval_cn`: Evaluates all Chinese subtasks:
+`agieval_gaokao_biology`, `agieval_gaokao_chemistry`, `agieval_gaokao_chinese`, `agieval_gaokao_geography`,
+`agieval_gaokao_history`, `agieval_gaokao_mathqa`, `agieval_gaokao_mathcloze`, `agieval_gaokao_physics`, `agieval_jec_qa_ca`, `agieval_jec_qa_kd`, and `agieval_logiqa_zh`.
+
+- `agieval_nous`: Evaluates the specific subset of AGIEval tasks (multiple-choice and English-only) used in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
+
+#### Tasks
+
+- `agieval_aqua_rat`
+- `agieval_gaokao_biology`
+- `agieval_gaokao_chemistry`
+- `agieval_gaokao_chinese`
+- `agieval_gaokao_english`
+- `agieval_gaokao_geography`
+- `agieval_gaokao_history`
+- `agieval_gaokao_mathqa`
+- `agieval_gaokao_mathcloze`
+- `agieval_gaokao_physics`
+- `agieval_jec_qa_ca`
+- `agieval_jec_qa_kd`
+- `agieval_logiqa_en`
+- `agieval_logiqa_zh`
+- `agieval_lsat_ar`
+- `agieval_lsat_lr`
+- `agieval_lsat_rc`
+- `agieval_sat_en`
+- `agieval_sat_en_without_passage`
+- `agieval_sat_math`
+- `agieval_math`
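+
+#### Example usage
+
+A minimal sketch of running one of these groups with the harness CLI; the model and batch size below are illustrative placeholders, not recommendations:
+
+```bash
+lm_eval --model hf \
+    --model_args pretrained=EleutherAI/pythia-1.4b \
+    --tasks agieval_en \
+    --batch_size 8
+```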
diff --git a/lm_eval/tasks/agieval/aqua-rat.yaml b/lm_eval/tasks/agieval/aqua-rat.yaml
new file mode 100644
index 0000000000..babebf638e
--- /dev/null
+++ b/lm_eval/tasks/agieval/aqua-rat.yaml
@@ -0,0 +1,24 @@
+group:
+  - agieval
+  - agieval_en
+  - agieval_nous
+task: agieval_aqua_rat
+dataset_path: hails/agieval-aqua-rat
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "{{choices}}"
+process_results: !function utils.process_results_mcqa
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
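+# Note: this file doubles as a base config. The other multiple-choice AGIEval
+# subtasks below reuse it via `include: aqua-rat.yaml`, overriding only the
+# group memberships, task name, and dataset path.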
diff --git a/lm_eval/tasks/agieval/gaokao-biology.yaml b/lm_eval/tasks/agieval/gaokao-biology.yaml
new file mode 100644
index 0000000000..36c44cbbee
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-biology.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_biology
+dataset_path: hails/agieval-gaokao-biology
diff --git a/lm_eval/tasks/agieval/gaokao-chemistry.yaml b/lm_eval/tasks/agieval/gaokao-chemistry.yaml
new file mode 100644
index 0000000000..69810122eb
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-chemistry.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_chemistry
+dataset_path: hails/agieval-gaokao-chemistry
diff --git a/lm_eval/tasks/agieval/gaokao-chinese.yaml b/lm_eval/tasks/agieval/gaokao-chinese.yaml
new file mode 100644
index 0000000000..30d249b9d5
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-chinese.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_chinese
+dataset_path: hails/agieval-gaokao-chinese
diff --git a/lm_eval/tasks/agieval/gaokao-english.yaml b/lm_eval/tasks/agieval/gaokao-english.yaml
new file mode 100644
index 0000000000..a540fcf25f
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-english.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_en # categorized as EN because the AGIEval codebase lists this task in `english_qa_tasks`
+task: agieval_gaokao_english
+dataset_path: hails/agieval-gaokao-english
diff --git a/lm_eval/tasks/agieval/gaokao-geography.yaml b/lm_eval/tasks/agieval/gaokao-geography.yaml
new file mode 100644
index 0000000000..2fe43bfd2c
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-geography.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_geography
+dataset_path: hails/agieval-gaokao-geography
diff --git a/lm_eval/tasks/agieval/gaokao-history.yaml b/lm_eval/tasks/agieval/gaokao-history.yaml
new file mode 100644
index 0000000000..b9c9c630fa
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-history.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_history
+dataset_path: hails/agieval-gaokao-history
diff --git a/lm_eval/tasks/agieval/gaokao-mathcloze.yaml b/lm_eval/tasks/agieval/gaokao-mathcloze.yaml
new file mode 100644
index 0000000000..74cbad1c03
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-mathcloze.yaml
@@ -0,0 +1,25 @@
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_mathcloze
+dataset_path: hails/agieval-gaokao-mathcloze
+dataset_name: null
+output_type: generate_until
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{answer}}"
+process_results: !function utils.process_results
+generation_kwargs:
+  max_gen_toks: 32
+  do_sample: False
+  temperature: 0.0
+  until:
+    - "Q:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/agieval/gaokao-mathqa.yaml b/lm_eval/tasks/agieval/gaokao-mathqa.yaml
new file mode 100644
index 0000000000..aa94e8eec8
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-mathqa.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_mathqa
+dataset_path: hails/agieval-gaokao-mathqa
diff --git a/lm_eval/tasks/agieval/gaokao-physics.yaml b/lm_eval/tasks/agieval/gaokao-physics.yaml
new file mode 100644
index 0000000000..175dd6cca0
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-physics.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_physics
+dataset_path: hails/agieval-gaokao-physics
diff --git a/lm_eval/tasks/agieval/jec-qa-ca.yaml b/lm_eval/tasks/agieval/jec-qa-ca.yaml
new file mode 100644
index 0000000000..f93b47a5b1
--- /dev/null
+++ b/lm_eval/tasks/agieval/jec-qa-ca.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_jec_qa_ca
+dataset_path: hails/agieval-jec-qa-ca
diff --git a/lm_eval/tasks/agieval/jec-qa-kd.yaml b/lm_eval/tasks/agieval/jec-qa-kd.yaml
new file mode 100644
index 0000000000..0458eb7ea8
--- /dev/null
+++ b/lm_eval/tasks/agieval/jec-qa-kd.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_jec_qa_kd
+dataset_path: hails/agieval-jec-qa-kd
diff --git a/lm_eval/tasks/agieval/logiqa-en.yaml b/lm_eval/tasks/agieval/logiqa-en.yaml
new file mode 100644
index 0000000000..7112418659
--- /dev/null
+++ b/lm_eval/tasks/agieval/logiqa-en.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_logiqa_en
+dataset_path: hails/agieval-logiqa-en
diff --git a/lm_eval/tasks/agieval/logiqa-zh.yaml b/lm_eval/tasks/agieval/logiqa-zh.yaml
new file mode 100644
index 0000000000..82e688006b
--- /dev/null
+++ b/lm_eval/tasks/agieval/logiqa-zh.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_logiqa_zh
+dataset_path: hails/agieval-logiqa-zh
diff --git a/lm_eval/tasks/agieval/lsat-ar.yaml b/lm_eval/tasks/agieval/lsat-ar.yaml
new file mode 100644
index 0000000000..302f9b519e
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-ar.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_ar
+dataset_path: hails/agieval-lsat-ar
diff --git a/lm_eval/tasks/agieval/lsat-lr.yaml b/lm_eval/tasks/agieval/lsat-lr.yaml
new file mode 100644
index 0000000000..62158e5cec
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-lr.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_lr
+dataset_path: hails/agieval-lsat-lr
diff --git a/lm_eval/tasks/agieval/lsat-rc.yaml b/lm_eval/tasks/agieval/lsat-rc.yaml
new file mode 100644
index 0000000000..de155af78a
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-rc.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_rc
+dataset_path: hails/agieval-lsat-rc
diff --git a/lm_eval/tasks/agieval/math.yaml b/lm_eval/tasks/agieval/math.yaml
new file mode 100644
index 0000000000..c8ec9eec60
--- /dev/null
+++ b/lm_eval/tasks/agieval/math.yaml
@@ -0,0 +1,25 @@
+group:
+  - agieval
+  - agieval_en
+task: agieval_math
+dataset_path: hails/agieval-math
+dataset_name: null
+output_type: generate_until
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{answer}}"
+process_results: !function utils.process_results
+generation_kwargs:
+  max_gen_toks: 32
+  do_sample: False
+  temperature: 0.0
+  until:
+    - "Q:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
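+# Note: like gaokao-mathcloze, this is a generative task. Decoding is greedy
+# (do_sample: False), stops at the "Q:" few-shot delimiter, and the generated
+# answer is scored by utils.process_results via the is_equiv() math-equivalence
+# check rather than by comparing choice loglikelihoods.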
diff --git a/lm_eval/tasks/agieval/sat-en-without-passage.yaml b/lm_eval/tasks/agieval/sat-en-without-passage.yaml
new file mode 100644
index 0000000000..01490d9ee1
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-en-without-passage.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_en_without_passage
+dataset_path: hails/agieval-sat-en-without-passage
diff --git a/lm_eval/tasks/agieval/sat-en.yaml b/lm_eval/tasks/agieval/sat-en.yaml
new file mode 100644
index 0000000000..a45dba1507
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-en.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_en
+dataset_path: hails/agieval-sat-en
diff --git a/lm_eval/tasks/agieval/sat-math.yaml b/lm_eval/tasks/agieval/sat-math.yaml
new file mode 100644
index 0000000000..f5b644ee06
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-math.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_math
+dataset_path: hails/agieval-sat-math
diff --git a/lm_eval/tasks/agieval/utils.py b/lm_eval/tasks/agieval/utils.py
new file mode 100644
index 0000000000..8fcbdec1fe
--- /dev/null
+++ b/lm_eval/tasks/agieval/utils.py
@@ -0,0 +1,275 @@
+# Answer parsing and normalization code, adapted from
+# https://github.com/ruixiangcui/AGIEval/blob/main/src/math_equivalence.py
+# and https://github.com/ruixiangcui/AGIEval/blob/main/src/post_process.py
+import re
+from typing import Dict, List
+
+import numpy as np
+
+
+def parse_math_answer(raw_string):
+    def remove_boxed(s):
+        left = "\\boxed{"
+        try:
+            assert s[: len(left)] == left
+            assert s[-1] == "}"
+            answer = s[len(left) : -1]
+            if "=" in answer:
+                answer = answer.split("=")[-1].lstrip(" ")
+            return answer
+        except Exception:
+            return None
+
+    def last_boxed_only_string(string):
+        idx = string.rfind("\\boxed")
+        if idx < 0:
+            idx = string.rfind("\\fbox")
+            if idx < 0:
+                return None
+        i = idx
+        right_brace_idx = None
+        num_left_braces_open = 0
+        while i < len(string):
+            if string[i] == "{":
+                num_left_braces_open += 1
+            if string[i] == "}":
+                num_left_braces_open -= 1
+                if num_left_braces_open == 0:
+                    right_brace_idx = i
+                    break
+            i += 1
+
+        if right_brace_idx is None:
+            retval = None
+        else:
+            retval = string[idx : right_brace_idx + 1]
+
+        return retval
+
+    def get_answer_with_dollar_sign(s):
+        first_pattern = r"\$(.*)\$"
+        last_match = None
+        matches = re.findall(first_pattern, s)
+        if matches:
+            last_match = matches[-1]
+            if "=" in last_match:
+                last_match = last_match.split("=")[-1].lstrip(" ")
+        return last_match
+
+    def get_answer_without_dollar_sign(s):
+        last_match = None
+        if "=" in s:
+            last_match = s.split("=")[-1].lstrip(" ").rstrip(".")
+            if "\\n" in last_match:
+                last_match = last_match.split("\\n")[0]
+        else:
+            pattern = r"(?:\$)?\d+(?:\.\d+)?(?![\w\d])"
+            matches = re.findall(pattern, s)
+            if matches:
+                last_match = matches[-1]
+        return last_match
+
+    if "\\boxed" in raw_string:
+        answer = remove_boxed(last_boxed_only_string(raw_string))
+    else:
+        answer = get_answer_with_dollar_sign(raw_string)
+        if not answer:
+            answer = get_answer_without_dollar_sign(raw_string)
+    return answer
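+
+
+# Illustrative examples of what the parser extracts (these example strings are
+# not from the upstream source):
+#   parse_math_answer("The answer is \\boxed{42}.")       -> "42"
+#   parse_math_answer("So we get $\\frac{3}{4}$ in all.") -> "\\frac{3}{4}"
+#   parse_math_answer("Thus x = 7.")                      -> "7"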
+
+
+# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
+def _fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except Exception:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+
+
+def _fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except Exception:
+        return string
+
+
+def _remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if "\\text{ " in string:
+        splits = string.split("\\text{ ")
+        assert len(splits) == 2
+        return splits[0]
+    else:
+        return string
+
+
+def _fix_sqrt(string):
+    if "\\sqrt" not in string:
+        return string
+    splits = string.split("\\sqrt")
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != "{":
+            a = split[0]
+            new_substr = "\\sqrt{" + a + "}" + split[1:]
+        else:
+            new_substr = "\\sqrt" + split
+        new_string += new_substr
+    return new_string
+
+
+def _strip_string(string):
+    # linebreaks
+    string = string.replace("\n", "")
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+
+    # remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+
+    # remove units (on the right)
+    string = _remove_right_units(string)
+
+    # remove percentage
+    string = string.replace("\\%", "")
+
+    # " 0." is equivalent to " ." and "{0." to "{."; alternatively, prepend "0" if "." starts the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at the beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
+    # Even works with \frac1{72} (but not \frac{72}1).
+    string = _fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    return string
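+
+# Illustrative examples of the normalization (not from the upstream source):
+#   _strip_string("\\tfrac{1}{2}") -> "\\frac{1}{2}"
+#   _strip_string("\\frac12")      -> "\\frac{1}{2}"
+#   _strip_string("0.5")           -> "\\frac{1}{2}"
+#   _strip_string("50\\%")         -> "50"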
+
+
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print("WARNING: Both None")
+        return True
+    if str1 is None or str2 is None:
+        return False
+
+    str1, str2 = parse_math_answer(str1), parse_math_answer(str2)
+
+    try:
+        ss1 = _strip_string(str1)
+        ss2 = _strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
+
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidate = results[0]
+
+    gold = doc["answer"]
+
+    if not gold:
+        print(doc, candidate, gold)
+
+    retval = 1 if is_equiv(candidate, gold) else 0
+
+    return {"acc": retval}
+
+
+# Use a custom process_results() function, because AGIEval questions can have
+# multiple valid answer choices.
+def process_results_mcqa(doc, results):
+    # keep only the loglikelihood component of each (loglikelihood, is_greedy) result
+    results = [result[0] for result in results]
+
+    gold = doc["gold"]
+
+    acc = 1.0 if int(np.argmax(results)) in gold else 0.0
+    completion_len = np.array([float(len(i)) for i in doc["choices"]])
+    acc_norm = 1.0 if int(np.argmax(np.array(results) / completion_len)) in gold else 0.0
+
+    return {
+        "acc": acc,
+        "acc_norm": acc_norm,
+    }
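+
+
+if __name__ == "__main__":
+    # Minimal, illustrative smoke test of the equivalence pipeline
+    # (not part of the upstream AGIEval code).
+    assert is_equiv("The answer is $\\frac{1}{2}$", "0.5")
+    assert is_equiv("\\boxed{42}", "42")
+    assert not is_equiv("The answer is $3$", "4")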