diff --git a/lm_eval/tasks/agieval/README.md b/lm_eval/tasks/agieval/README.md
new file mode 100644
index 0000000000..faaf47b6be
--- /dev/null
+++ b/lm_eval/tasks/agieval/README.md
@@ -0,0 +1,114 @@
+# AGIEval
+
+### Paper
+
+Title: AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models
+
+Abstract: https://arxiv.org/abs/2304.06364
+
+AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models on tasks pertinent to human cognition and problem-solving.
+The benchmark is derived from 20 official, public, and high-standard admission and qualification exams intended for general human test-takers, such as general college admission tests (e.g., the Chinese College Entrance Exam (Gaokao) and the American SAT), law school admission tests, math competitions, lawyer qualification tests, and national civil service exams.
+
+Homepage: https://github.com/ruixiangcui/AGIEval
+
+### Citation
+
+```
+@misc{zhong2023agieval,
+    title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
+    author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
+    year={2023},
+    eprint={2304.06364},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+Please make sure to also cite the individual source datasets in your paper when you use them. The relevant citation information is listed below:
+
+```
+@inproceedings{ling-etal-2017-program,
+    title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
+    author = "Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil",
+    booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+    month = jul,
+    year = "2017",
+    address = "Vancouver, Canada",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/P17-1015",
+    doi = "10.18653/v1/P17-1015",
+    pages = "158--167"
+}
+
+@article{hendrycksmath2021,
+    title={Measuring Mathematical Problem Solving With the MATH Dataset},
+    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+    journal={NeurIPS},
+    year={2021}
+}
+
+@inproceedings{Liu2020LogiQAAC,
+    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
+    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
+    booktitle={International Joint Conference on Artificial Intelligence},
+    year={2020}
+}
+
+@inproceedings{zhong2019jec,
+    title={JEC-QA: A Legal-Domain Question Answering Dataset},
+    author={Zhong, Haoxi and Xiao, Chaojun and Tu, Cunchao and Zhang, Tianyang and Liu, Zhiyuan and Sun, Maosong},
+    booktitle={Proceedings of AAAI},
+    year={2020}
+}
+
+@article{Wang2021FromLT,
+    title={From LSAT: The Progress and Challenges of Complex Reasoning},
+    author={Siyuan Wang and Zhongkun Liu and Wanjun Zhong and Ming Zhou and Zhongyu Wei and Zhumin Chen and Nan Duan},
+    journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+    year={2021},
+    volume={30},
+    pages={2201--2216}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+- `agieval`: Evaluates all tasks listed below.
+
+- `agieval_en`: Evaluates all English subtasks: `agieval_aqua_rat`, `agieval_gaokao_english`, `agieval_logiqa_en`, `agieval_lsat_*`, `agieval_sat_*`, and `agieval_math`.
+
+- `agieval_cn`: Evaluates all Chinese subtasks:
+`agieval_gaokao_biology`, `agieval_gaokao_chemistry`, `agieval_gaokao_chinese`, `agieval_gaokao_geography`,
+`agieval_gaokao_history`, `agieval_gaokao_mathqa`, `agieval_gaokao_mathcloze`, `agieval_gaokao_physics`, `agieval_jec_qa_ca`, `agieval_jec_qa_kd`, and `agieval_logiqa_zh`.
+
+- `agieval_nous`: Evaluates the specific subset of AGIEval tasks (multiple-choice and English-only) used in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
+
+#### Tasks
+
+- `agieval_aqua_rat`
+- `agieval_gaokao_biology`
+- `agieval_gaokao_chemistry`
+- `agieval_gaokao_chinese`
+- `agieval_gaokao_english`
+- `agieval_gaokao_geography`
+- `agieval_gaokao_history`
+- `agieval_gaokao_mathqa`
+- `agieval_gaokao_mathcloze`
+- `agieval_gaokao_physics`
+- `agieval_jec_qa_ca`
+- `agieval_jec_qa_kd`
+- `agieval_logiqa_en`
+- `agieval_logiqa_zh`
+- `agieval_lsat_ar`
+- `agieval_lsat_lr`
+- `agieval_lsat_rc`
+- `agieval_sat_en`
+- `agieval_sat_en_without_passage`
+- `agieval_sat_math`
+- `agieval_math`
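+
+#### Example usage
+
+A minimal sketch of running one of these groups with the harness CLI; the model and batch size below are illustrative placeholders, not recommendations:
+
+```bash
+lm_eval --model hf \
+    --model_args pretrained=EleutherAI/pythia-1.4b \
+    --tasks agieval_en \
+    --batch_size 8
+```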
diff --git a/lm_eval/tasks/agieval/aqua-rat.yaml b/lm_eval/tasks/agieval/aqua-rat.yaml
new file mode 100644
index 0000000000..babebf638e
--- /dev/null
+++ b/lm_eval/tasks/agieval/aqua-rat.yaml
@@ -0,0 +1,24 @@
+group:
+  - agieval
+  - agieval_en
+  - agieval_nous
+task: agieval_aqua_rat
+dataset_path: hails/agieval-aqua-rat
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "{{choices}}"
+process_results: !function utils.process_results_mcqa
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
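+# Note: this file doubles as a base config. The other multiple-choice AGIEval
+# subtasks below reuse it via `include: aqua-rat.yaml`, overriding only the
+# group memberships, task name, and dataset path.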
diff --git a/lm_eval/tasks/agieval/gaokao-biology.yaml b/lm_eval/tasks/agieval/gaokao-biology.yaml
new file mode 100644
index 0000000000..36c44cbbee
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-biology.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_biology
+dataset_path: hails/agieval-gaokao-biology
diff --git a/lm_eval/tasks/agieval/gaokao-chemistry.yaml b/lm_eval/tasks/agieval/gaokao-chemistry.yaml
new file mode 100644
index 0000000000..69810122eb
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-chemistry.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_chemistry
+dataset_path: hails/agieval-gaokao-chemistry
diff --git a/lm_eval/tasks/agieval/gaokao-chinese.yaml b/lm_eval/tasks/agieval/gaokao-chinese.yaml
new file mode 100644
index 0000000000..30d249b9d5
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-chinese.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_chinese
+dataset_path: hails/agieval-gaokao-chinese
diff --git a/lm_eval/tasks/agieval/gaokao-english.yaml b/lm_eval/tasks/agieval/gaokao-english.yaml
new file mode 100644
index 0000000000..a540fcf25f
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-english.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_en # categorized as EN because the AGIEval codebase lists this task in `english_qa_tasks`
+task: agieval_gaokao_english
+dataset_path: hails/agieval-gaokao-english
diff --git a/lm_eval/tasks/agieval/gaokao-geography.yaml b/lm_eval/tasks/agieval/gaokao-geography.yaml
new file mode 100644
index 0000000000..2fe43bfd2c
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-geography.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_geography
+dataset_path: hails/agieval-gaokao-geography
diff --git a/lm_eval/tasks/agieval/gaokao-history.yaml b/lm_eval/tasks/agieval/gaokao-history.yaml
new file mode 100644
index 0000000000..b9c9c630fa
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-history.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_history
+dataset_path: hails/agieval-gaokao-history
diff --git a/lm_eval/tasks/agieval/gaokao-mathcloze.yaml b/lm_eval/tasks/agieval/gaokao-mathcloze.yaml
new file mode 100644
index 0000000000..74cbad1c03
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-mathcloze.yaml
@@ -0,0 +1,25 @@
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_mathcloze
+dataset_path: hails/agieval-gaokao-mathcloze
+dataset_name: null
+output_type: generate_until
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{answer}}"
+process_results: !function utils.process_results
+generation_kwargs:
+  max_gen_toks: 32
+  do_sample: False
+  temperature: 0.0
+  until:
+    - "Q:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/agieval/gaokao-mathqa.yaml b/lm_eval/tasks/agieval/gaokao-mathqa.yaml
new file mode 100644
index 0000000000..aa94e8eec8
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-mathqa.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_mathqa
+dataset_path: hails/agieval-gaokao-mathqa
diff --git a/lm_eval/tasks/agieval/gaokao-physics.yaml b/lm_eval/tasks/agieval/gaokao-physics.yaml
new file mode 100644
index 0000000000..175dd6cca0
--- /dev/null
+++ b/lm_eval/tasks/agieval/gaokao-physics.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_gaokao_physics
+dataset_path: hails/agieval-gaokao-physics
diff --git a/lm_eval/tasks/agieval/jec-qa-ca.yaml b/lm_eval/tasks/agieval/jec-qa-ca.yaml
new file mode 100644
index 0000000000..f93b47a5b1
--- /dev/null
+++ b/lm_eval/tasks/agieval/jec-qa-ca.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_jec_qa_ca
+dataset_path: hails/agieval-jec-qa-ca
diff --git a/lm_eval/tasks/agieval/jec-qa-kd.yaml b/lm_eval/tasks/agieval/jec-qa-kd.yaml
new file mode 100644
index 0000000000..0458eb7ea8
--- /dev/null
+++ b/lm_eval/tasks/agieval/jec-qa-kd.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_jec_qa_kd
+dataset_path: hails/agieval-jec-qa-kd
diff --git a/lm_eval/tasks/agieval/logiqa-en.yaml b/lm_eval/tasks/agieval/logiqa-en.yaml
new file mode 100644
index 0000000000..7112418659
--- /dev/null
+++ b/lm_eval/tasks/agieval/logiqa-en.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_logiqa_en
+dataset_path: hails/agieval-logiqa-en
diff --git a/lm_eval/tasks/agieval/logiqa-zh.yaml b/lm_eval/tasks/agieval/logiqa-zh.yaml
new file mode 100644
index 0000000000..82e688006b
--- /dev/null
+++ b/lm_eval/tasks/agieval/logiqa-zh.yaml
@@ -0,0 +1,6 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_cn
+task: agieval_logiqa_zh
+dataset_path: hails/agieval-logiqa-zh
diff --git a/lm_eval/tasks/agieval/lsat-ar.yaml b/lm_eval/tasks/agieval/lsat-ar.yaml
new file mode 100644
index 0000000000..302f9b519e
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-ar.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_ar
+dataset_path: hails/agieval-lsat-ar
diff --git a/lm_eval/tasks/agieval/lsat-lr.yaml b/lm_eval/tasks/agieval/lsat-lr.yaml
new file mode 100644
index 0000000000..62158e5cec
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-lr.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_lr
+dataset_path: hails/agieval-lsat-lr
diff --git a/lm_eval/tasks/agieval/lsat-rc.yaml b/lm_eval/tasks/agieval/lsat-rc.yaml
new file mode 100644
index 0000000000..de155af78a
--- /dev/null
+++ b/lm_eval/tasks/agieval/lsat-rc.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_lsat_rc
+dataset_path: hails/agieval-lsat-rc
diff --git a/lm_eval/tasks/agieval/math.yaml b/lm_eval/tasks/agieval/math.yaml
new file mode 100644
index 0000000000..c8ec9eec60
--- /dev/null
+++ b/lm_eval/tasks/agieval/math.yaml
@@ -0,0 +1,25 @@
+group:
+  - agieval
+  - agieval_en
+task: agieval_math
+dataset_path: hails/agieval-math
+dataset_name: null
+output_type: generate_until
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "{{query}}"
+doc_to_target: "{{answer}}"
+process_results: !function utils.process_results
+generation_kwargs:
+  max_gen_toks: 32
+  do_sample: False
+  temperature: 0.0
+  until:
+    - "Q:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
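+# Note: like gaokao-mathcloze, this is a generative task. Decoding is greedy
+# (do_sample: False), stops at the "Q:" few-shot delimiter, and the generated
+# answer is scored by utils.process_results via the is_equiv() math-equivalence
+# check rather than by comparing choice loglikelihoods.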
diff --git a/lm_eval/tasks/agieval/sat-en-without-passage.yaml b/lm_eval/tasks/agieval/sat-en-without-passage.yaml
new file mode 100644
index 0000000000..01490d9ee1
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-en-without-passage.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_en_without_passage
+dataset_path: hails/agieval-sat-en-without-passage
diff --git a/lm_eval/tasks/agieval/sat-en.yaml b/lm_eval/tasks/agieval/sat-en.yaml
new file mode 100644
index 0000000000..a45dba1507
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-en.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_en
+dataset_path: hails/agieval-sat-en
diff --git a/lm_eval/tasks/agieval/sat-math.yaml b/lm_eval/tasks/agieval/sat-math.yaml
new file mode 100644
index 0000000000..f5b644ee06
--- /dev/null
+++ b/lm_eval/tasks/agieval/sat-math.yaml
@@ -0,0 +1,7 @@
+include: aqua-rat.yaml
+group:
+  - agieval
+  - agieval_nous
+  - agieval_en
+task: agieval_sat_math
+dataset_path: hails/agieval-sat-math
diff --git a/lm_eval/tasks/agieval/utils.py b/lm_eval/tasks/agieval/utils.py
new file mode 100644
index 0000000000..8fcbdec1fe
--- /dev/null
+++ b/lm_eval/tasks/agieval/utils.py
@@ -0,0 +1,275 @@
+# Answer parsing and normalization code, adapted from
+# https://github.com/ruixiangcui/AGIEval/blob/main/src/math_equivalence.py
+# and https://github.com/ruixiangcui/AGIEval/blob/main/src/post_process.py
+import re
+from typing import Dict, List
+
+import numpy as np
+
+
+def parse_math_answer(raw_string):
+    def remove_boxed(s):
+        left = "\\boxed{"
+        try:
+            assert s[: len(left)] == left
+            assert s[-1] == "}"
+            answer = s[len(left) : -1]
+            if "=" in answer:
+                answer = answer.split("=")[-1].lstrip(" ")
+            return answer
+        except Exception:
+            return None
+
+    def last_boxed_only_string(string):
+        idx = string.rfind("\\boxed")
+        if idx < 0:
+            idx = string.rfind("\\fbox")
+            if idx < 0:
+                return None
+        i = idx
+        right_brace_idx = None
+        num_left_braces_open = 0
+        while i < len(string):
+            if string[i] == "{":
+                num_left_braces_open += 1
+            if string[i] == "}":
+                num_left_braces_open -= 1
+                if num_left_braces_open == 0:
+                    right_brace_idx = i
+                    break
+            i += 1
+
+        if right_brace_idx is None:
+            retval = None
+        else:
+            retval = string[idx : right_brace_idx + 1]
+
+        return retval
+
+    def get_answer_with_dollar_sign(s):
+        first_pattern = r"\$(.*)\$"
+        last_match = None
+        matches = re.findall(first_pattern, s)
+        if matches:
+            last_match = matches[-1]
+            if "=" in last_match:
+                last_match = last_match.split("=")[-1].lstrip(" ")
+        return last_match
+
+    def get_answer_without_dollar_sign(s):
+        last_match = None
+        if "=" in s:
+            last_match = s.split("=")[-1].lstrip(" ").rstrip(".")
+            if "\\n" in last_match:
+                last_match = last_match.split("\\n")[0]
+        else:
+            pattern = r"(?:\$)?\d+(?:\.\d+)?(?![\w\d])"
+            matches = re.findall(pattern, s)
+            if matches:
+                last_match = matches[-1]
+        return last_match
+
+    if "\\boxed" in raw_string:
+        answer = remove_boxed(last_boxed_only_string(raw_string))
+    else:
+        answer = get_answer_with_dollar_sign(raw_string)
+        if not answer:
+            answer = get_answer_without_dollar_sign(raw_string)
+    return answer
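+
+
+# Illustrative examples of what the parser extracts (these example strings are
+# not from the upstream source):
+#   parse_math_answer("The answer is \\boxed{42}.")       -> "42"
+#   parse_math_answer("So we get $\\frac{3}{4}$ in all.") -> "\\frac{3}{4}"
+#   parse_math_answer("Thus x = 7.")                      -> "7"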
+
+
+# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
+def _fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except Exception:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+
+
+def _fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except Exception:
+        return string
+
+
+def _remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if "\\text{ " in string:
+        splits = string.split("\\text{ ")
+        assert len(splits) == 2
+        return splits[0]
+    else:
+        return string
+
+
+def _fix_sqrt(string):
+    if "\\sqrt" not in string:
+        return string
+    splits = string.split("\\sqrt")
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != "{":
+            a = split[0]
+            new_substr = "\\sqrt{" + a + "}" + split[1:]
+        else:
+            new_substr = "\\sqrt" + split
+        new_string += new_substr
+    return new_string
+
+
+def _strip_string(string):
+    # linebreaks
+    string = string.replace("\n", "")
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+
+    # replace \\ with \
+    string = string.replace("\\\\", "\\")
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+
+    # remove circ (degrees)
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+
+    # remove units (on the right)
+    string = _remove_right_units(string)
+
+    # remove percentage
+    string = string.replace("\\%", "")
+
+    # " 0." is equivalent to " ." and "{0." to "{."; alternatively, prepend "0" if "." starts the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at the beginning
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
+    # Even works with \frac1{72} (but not \frac{72}1).
+    string = _fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == "0.5":
+        string = "\\frac{1}{2}"
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    return string
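+
+# Illustrative examples of the normalization (not from the upstream source):
+#   _strip_string("\\tfrac{1}{2}") -> "\\frac{1}{2}"
+#   _strip_string("\\frac12")      -> "\\frac{1}{2}"
+#   _strip_string("0.5")           -> "\\frac{1}{2}"
+#   _strip_string("50\\%")         -> "50"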
+
+
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print("WARNING: Both None")
+        return True
+    if str1 is None or str2 is None:
+        return False
+
+    str1, str2 = parse_math_answer(str1), parse_math_answer(str2)
+
+    try:
+        ss1 = _strip_string(str1)
+        ss2 = _strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
+
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidate = results[0]
+
+    gold = doc["answer"]
+
+    if not gold:
+        print(doc, candidate, gold)
+
+    retval = 1 if is_equiv(candidate, gold) else 0
+
+    return {"acc": retval}
+
+
+# Use a custom process_results() function, because AGIEval questions can have
+# multiple valid answer choices.
+def process_results_mcqa(doc, results):
+    # keep only the loglikelihood component of each (loglikelihood, is_greedy) result
+    results = [result[0] for result in results]
+
+    gold = doc["gold"]
+
+    acc = 1.0 if int(np.argmax(results)) in gold else 0.0
+    completion_len = np.array([float(len(i)) for i in doc["choices"]])
+    acc_norm = 1.0 if int(np.argmax(np.array(results) / completion_len)) in gold else 0.0
+
+    return {
+        "acc": acc,
+        "acc_norm": acc_norm,
+    }
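+
+
+if __name__ == "__main__":
+    # Minimal, illustrative smoke test of the equivalence pipeline
+    # (not part of the upstream AGIEval code).
+    assert is_equiv("The answer is $\\frac{1}{2}$", "0.5")
+    assert is_equiv("\\boxed{42}", "42")
+    assert not is_equiv("The answer is $3$", "4")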