From f47ffa26e9ae88994beaaa78a708789d636a8e88 Mon Sep 17 00:00:00 2001 From: Yiran Wu <32823396+kevin666aa@users.noreply.github.com> Date: Mon, 18 Dec 2023 09:37:28 -0500 Subject: [PATCH] Add MATH tests to testbed (#914) * add MATH eval to testbed * update --------- Co-authored-by: Qingyun Wu --- .../testbed/includes/math_requirements.txt | 4 + .../tools/testbed/scenarios/MATH/README.md | 27 ++++++ .../tools/testbed/scenarios/MATH/answer.txt | 1 + .../scenarios/MATH/count_correct_math.py | 56 +++++++++++ .../scenarios/MATH/problems_to_json.py | 77 +++++++++++++++ .../tools/testbed/scenarios/MATH/prompt.txt | 1 + .../tools/testbed/scenarios/MATH/scenario.py | 97 +++++++++++++++++++ 7 files changed, 263 insertions(+) create mode 100644 samples/tools/testbed/includes/math_requirements.txt create mode 100644 samples/tools/testbed/scenarios/MATH/README.md create mode 100644 samples/tools/testbed/scenarios/MATH/answer.txt create mode 100644 samples/tools/testbed/scenarios/MATH/count_correct_math.py create mode 100644 samples/tools/testbed/scenarios/MATH/problems_to_json.py create mode 100644 samples/tools/testbed/scenarios/MATH/prompt.txt create mode 100644 samples/tools/testbed/scenarios/MATH/scenario.py diff --git a/samples/tools/testbed/includes/math_requirements.txt b/samples/tools/testbed/includes/math_requirements.txt new file mode 100644 index 000000000000..0600c8ce047a --- /dev/null +++ b/samples/tools/testbed/includes/math_requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/microsoft/autogen.git +sympy +matplotlib +numpy diff --git a/samples/tools/testbed/scenarios/MATH/README.md b/samples/tools/testbed/scenarios/MATH/README.md new file mode 100644 index 000000000000..7fea2cd0f4bd --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/README.md @@ -0,0 +1,27 @@ +## Get json file to run + +This will convert the math problems to json format and put it in the `scenarios/MATH` folder. 
import argparse
import json
import os
import sys

# Lower-cased phrases the answer checker uses when it accepts an answer.
_CORRECT_VERDICTS = (
    "the answer is correct",
    "the answer is approximated but should be correct",
)


def _is_correct(checker_file_path):
    """Return True when the checker's last message accepts the answer.

    The file is expected to be a JSON object whose "checker_proxy" entry is a
    list of messages; only the "content" of the last message is inspected.
    A missing or unreadable file, an empty message list, or a non-string
    content is reported on stderr and treated as "not correct", so that one
    broken run does not abort the whole count.
    """
    try:
        with open(checker_file_path, "r", encoding="utf-8") as file:
            checker_messages = json.load(file)
        verdict = checker_messages["checker_proxy"][-1]["content"]
    except (OSError, json.JSONDecodeError, KeyError, IndexError, TypeError) as err:
        print(f"Warning: no verdict in {checker_file_path}: {err!r}", file=sys.stderr)
        return False

    if not isinstance(verdict, str):
        print(f"Warning: non-text verdict in {checker_file_path}", file=sys.stderr)
        return False

    verdict = verdict.lower()
    return any(phrase in verdict for phrase in _CORRECT_VERDICTS)


def main(args):
    """Print, for every trial, how many problems the checker marked correct.

    Args:
        args: Object with two attributes: ``path`` (directory holding one
            sub-directory per problem, each with one sub-directory per trial
            index) and ``num_trials`` (number of trial indices to report).

    Returns:
        A list with the number of correct problems for each trial index.
    """
    stars = "*" * 100

    # Only directories are problems; stray files in the results folder must
    # not inflate the "Total Problems" figure. Listed once, reused per trial.
    problem_names = sorted(
        name for name in os.listdir(args.path) if os.path.isdir(os.path.join(args.path, name))
    )

    correct_count = [0] * args.num_trials

    for i in range(args.num_trials):
        for problem_name in problem_names:
            problem_path = os.path.join(args.path, problem_name, str(i))
            # A problem without a directory for this trial counts as not correct.
            if not os.path.isdir(problem_path):
                continue
            if _is_correct(os.path.join(problem_path, "checker_messages.json")):
                correct_count[i] += 1

        print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(problem_names)}")

    return correct_count


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Print Math Problems results.""".strip(),
    )
    parser.add_argument(
        "--path",
        "-p",
        type=str,
        default="./results/problems/",
        help="Path to the problems directory",
    )
    # num trials
    parser.add_argument(
        "--num_trials",
        "-n",
        type=int,
        default=1,
        help="Number of trials to check",
    )

    args = parser.parse_args()
    main(args)
Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.", + "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}", + "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?", + "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.", + "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.", + "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?", + "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$", + "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.", + "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]", + "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?", + "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.", + "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. 
def problem_to_json(output_path="problems.jsonl", problem_list=None, answer_list=None):
    """Write one testbed scenario record per problem to a JSON Lines file.

    Each line is a JSON object with an "id" (``problem<index>``), the
    template directory "./", and the substitutions that place the problem
    text in ``prompt.txt`` and its answer in ``answer.txt``.

    Args:
        output_path: File to (over)write. Defaults to "problems.jsonl" in the
            current working directory.
        problem_list: Problem statements. Defaults to the module-level
            ``problems`` list.
        answer_list: Ground-truth answers, in the same order as the problems.
            Defaults to the module-level ``answers`` list.

    Returns:
        The path that was written.

    Raises:
        ValueError: If the number of problems and answers differ. Pairing by
            position would otherwise fail part-way through the file or
            silently drop the surplus answers.
    """
    if problem_list is None:
        problem_list = problems
    if answer_list is None:
        answer_list = answers

    if len(problem_list) != len(answer_list):
        raise ValueError(
            f"Got {len(problem_list)} problems but {len(answer_list)} answers; they must pair up one-to-one."
        )

    with open(output_path, "w", encoding="utf-8") as f:
        for i, (problem, answer) in enumerate(zip(problem_list, answer_list)):
            record = {
                "id": f"problem{i}",
                "template": "./",
                "substitutions": {
                    "prompt.txt": {"__PROMPT__": problem},
                    "answer.txt": {"__ANSWER__": answer},
                },
            }
            # One JSON object per line (JSON Lines).
            f.write(json.dumps(record) + "\n")

    return output_path


if __name__ == "__main__":
    problem_to_json()
import os
import json
import autogen

import testbed_utils

testbed_utils.init()

# Scenario: an assistant/user pair attempts the problem in prompt.txt, then a
# second checker/proxy pair compares the reply with the answer in answer.txt.

PROMPT = ""
with open("prompt.txt", "rt") as fh:
    PROMPT = fh.read()

ANSWER = ""
with open("answer.txt", "rt") as fh:
    ANSWER = fh.read()

# A message ending in one of these ends the solver conversation.
TERMINATION_SUFFIXES = ("TERMINATE", "TERMINATE.")

# Lower-cased verdict phrases; any of them ends the checker conversation.
CHECKER_VERDICTS = (
    "the answer is correct",
    "the answer is incorrect",
    "the reply doesn't contain an answer",
    "the answer is approximated but should be correct",
)


def _content(message):
    """Return the message's "content" as a string ("" when missing or None)."""
    return message.get("content") or ""


def _solver_is_done(message):
    """Return True when the message ends with TERMINATE (optionally with a period)."""
    return _content(message).rstrip().endswith(TERMINATION_SUFFIXES)


def _checker_is_done(message):
    """Return True when the message contains one of the checker verdicts."""
    text = _content(message).lower()
    return any(verdict in text for verdict in CHECKER_VERDICTS)


def _last_substantive_reply(messages):
    """Return the newest assistant message that carries an actual reply.

    Messages that are empty (or have no text content) and messages that are
    only a TERMINATE marker are skipped. Returns "" when nothing qualifies.
    """
    for message in reversed(messages):
        text = _content(message)
        if message["role"] != "assistant" or not text.strip():
            continue
        if text.strip() in TERMINATION_SUFFIXES:
            continue
        return text
    return ""


####################
# NOTE(review): "gpt40613" has no hyphens; confirm it matches a "model" entry
# in OAI_CONFIG_LIST, otherwise the filtered config list will be empty.
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model": ["gpt40613"]},
)
llm_config = {
    "cache_seed": 42,
    "config_list": config_list,
    "timeout": 600,
}
code_execution_config = {
    "work_dir": "coding",
    "use_docker": False,  # set to True or image name like "python:3" to use docker
}

# ---------between "user" and "assistant"---------
assistant = autogen.AssistantAgent(name="assistant", llm_config=llm_config)
user_proxy = autogen.UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    code_execution_config=code_execution_config,
    max_consecutive_auto_reply=10,
    is_termination_msg=_solver_is_done,
)

user_proxy.initiate_chat(assistant, message=PROMPT)


# --------- extract reply ---------
# chat_messages is the public view of the per-peer message history.
response_with_ans = _last_substantive_reply(assistant.chat_messages[user_proxy])


# ---------between "answer_checker" and "checker_proxy"---------
# define answer checker chat

# NOTE(review): the angle-bracket placeholders below were missing from the
# text as received (it read 'The answer is ".' and 'Correct Answer: | Answer
# extracted: .'); the names are a reconstruction - confirm against the
# original prompt.
check_sys_msg = """You are a helpful AI assistant. You will use your coding and language skills to verify the answer.
You are given:
    1. A problem.
    2. A reply with the answer to the problem.
    3. A ground truth answer.
Please do the following:
1. Extract the answer in the reply: "The answer is <answer extracted>".
2. Check whether the answer in the reply matches the ground truth answer. When comparison is not obvious (for example, 3*\\sqrt(6) and 7.348), you may write code to check the answer and wait for the user to execute the code.
3. After everything is done, please choose a reply from the following options:
    - "The answer is correct."
    - "The answer is approximated but should be correct. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
    - "The answer is incorrect. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
    - "The reply doesn't contain an answer."
"""

answer_checker = autogen.AssistantAgent(name="checker", llm_config=llm_config, system_message=check_sys_msg)
checker_proxy = autogen.UserProxyAgent(
    name="checker_proxy",
    human_input_mode="NEVER",
    code_execution_config=code_execution_config,
    max_consecutive_auto_reply=5,
    is_termination_msg=_checker_is_done,
)

message_to_check = f"Problem: {PROMPT}\n\nReply: {response_with_ans}\n\nGround truth answer: {ANSWER}"
checker_proxy.initiate_chat(answer_checker, message=message_to_check)


####################
testbed_utils.finalize(agents=[assistant, user_proxy, answer_checker, checker_proxy])