forked from microsoft/autogen
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add MATH tests to testbed (microsoft#914)
* add MATH eval to testbed * update --------- Co-authored-by: Qingyun Wu <[email protected]>
- Loading branch information
Showing
7 changed files
with
263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
git+https://github.com/microsoft/autogen.git | ||
sympy | ||
matplotlib | ||
numpy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
## Get json file to run | ||
|
||
This will convert the math problems to JSON Lines format and write the result (`problems.jsonl`) to the `scenarios/MATH` folder. | ||
```sh | ||
cd samples/tools/testbed/ | ||
python scenarios/MATH/problems_to_json.py | ||
``` | ||
|
||
## Run the testbed | ||
|
||
Note: this will first run autogen on the math problems, and then use an LLM as an answer checker to check the answers. | ||
This means the results are not 100% accurate. | ||
|
||
```sh | ||
python run_scenarios.py scenarios/MATH/problems.jsonl -c <config_list> --requirements math_requirements.txt | ||
``` | ||
|
||
## Get the correct count | ||
Use `--path` or `-p` to specify the path to the problem directory. The default is `./results/problems/`, which is the default save path of this testbed. | ||
```sh | ||
python scenarios/MATH/count_correct_math.py --path <path_to_problem_dir> | ||
``` | ||
|
||
Example output: | ||
``` | ||
Trial 0 | Total Correct: 10 | Total Problems: 17 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__ANSWER__ |
56 changes: 56 additions & 0 deletions
56
samples/tools/testbed/scenarios/MATH/count_correct_math.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import argparse | ||
import json | ||
import os | ||
|
||
|
||
def _is_marked_correct(trial_path):
    """Return True if the checker's final message in *trial_path* marks the answer correct.

    Reads ``checker_messages.json`` from *trial_path*, takes the content of the
    last message stored under the ``"checker_proxy"`` key, lower-cases it and
    looks for one of the two "correct" verdict phrases.

    A missing, unreadable or malformed file counts as "not correct" (with a
    warning on stderr) so that one broken run cannot abort the whole tally.
    """
    import sys  # local import: only needed for the warning below

    checker_file_path = os.path.join(trial_path, "checker_messages.json")
    try:
        with open(checker_file_path, "r", encoding="utf-8") as file:
            checker_messages = json.load(file)
        final_content = checker_messages["checker_proxy"][-1]["content"]
    except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Warning: cannot read a verdict from {checker_file_path}: {err!r}", file=sys.stderr)
        return False

    # A null/None content is treated like an empty message.
    check_result = (final_content or "").lower()
    return (
        "the answer is correct" in check_result
        or "the answer is approximated but should be correct" in check_result
    )


def main(args):
    """Print, for every trial, how many problems the checker marked as correct.

    Args:
        args: Namespace with two attributes:
            ``path`` -- directory holding one sub-directory per problem, each
            of which holds one sub-directory per trial index (``0``, ``1``, ...);
            ``num_trials`` -- number of trial indices to tally.

    Output (one block per trial, on stdout)::

        ****...****
        Trial <i> | Total Correct: <n> | Total Problems: <m>
    """
    stars = "*" * 100

    # Only directories are problems; stray files in args.path must not inflate
    # the total. The listing is taken once because it does not change per trial.
    problem_names = [
        name for name in sorted(os.listdir(args.path)) if os.path.isdir(os.path.join(args.path, name))
    ]

    # initiate the correct count for each trial
    correct_count = [0] * args.num_trials

    for i in range(args.num_trials):
        for problem_name in problem_names:
            trial_path = os.path.join(args.path, problem_name, str(i))
            # A problem without a directory for this trial simply scores zero.
            if os.path.isdir(trial_path) and _is_marked_correct(trial_path):
                correct_count[i] += 1

        print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(problem_names)}")
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: collect the results directory and the number
    # of trials, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser(description="Print Math Problems results.")

    # Where the per-problem result directories live.
    cli.add_argument("--path", "-p", type=str, default="./results/problems/", help="Path to the problems directory")

    # How many trial indices (0 .. n-1) to tally.
    cli.add_argument("--num_trials", "-n", type=int, default=1, help="Number of trials to check")

    main(cli.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import json | ||
|
||
# Problem statements (LaTeX markup). problems[i] is paired by position with
# answers[i] below, so both lists must stay the same length and order.
problems = [
    "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
    "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]",
    "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.",
    "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}",
    "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?",
    "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.",
    "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.",
    "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?",
    "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$",
    "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.",
    "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]",
    "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?",
    "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
    "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. When Camera A and Camera B take their last picture together, how many minutes before noon is it?",
    "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$",
    "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. How many people are wearing all three items?",
    "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$",
]

# Ground-truth answers, in the same order as `problems`. Each string is written
# verbatim into answer.txt, so it must not carry stray surrounding whitespace.
answers = [
    # 6 algebra
    "(-\\infty, -14)\\cup(-3,\\infty)",
    "93",
    "4x-5y=-50",
    "-5",
    "2",
    "(-\\infty,0]\\cup[4,\\infty)",
    # 11 problems, 2 from each category, (1 algebra is deleted)
    "\\frac{10}{9}",
    "\\frac{1}{2}",
    "-588",
    "\\frac{1}{13}",
    "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]",
    "23",
    "116",
    "41",
    "43",
    "10",
    "\\frac{5\\sqrt{42}}{27}",
]
|
||
|
||
def problem_to_json(problem_list=None, answer_list=None, output_path=None):
    """Write one JSON object per problem (JSON Lines) and return the file path.

    Each line has the form::

        {"id": "problem<i>", "template": "./",
         "substitutions": {"prompt.txt": {"__PROMPT__": <problem>},
                           "answer.txt": {"__ANSWER__": <answer>}}}

    Args:
        problem_list: Problem statements. Defaults to the module-level ``problems``.
        answer_list: Ground-truth answers, paired with ``problem_list`` by
            position. Defaults to the module-level ``answers``.
        output_path: Destination file. Defaults to ``problems.jsonl`` next to
            this script, so the result does not depend on the directory the
            script is launched from (the "template": "./" entry is relative).

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If the number of problems and answers differ.
    """
    import os  # local import: this module's top-level imports only include json

    if problem_list is None:
        problem_list = problems
    if answer_list is None:
        answer_list = answers
    if len(problem_list) != len(answer_list):
        raise ValueError(
            f"Got {len(problem_list)} problems but {len(answer_list)} answers; they must be paired one-to-one."
        )
    if output_path is None:
        output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "problems.jsonl")

    with open(output_path, "w", encoding="utf-8") as f:
        for i, (problem, answer) in enumerate(zip(problem_list, answer_list)):
            record = {
                "id": f"problem{i}",
                "template": "./",
                "substitutions": {"prompt.txt": {"__PROMPT__": problem}, "answer.txt": {"__ANSWER__": answer}},
            }
            # One JSON object per line, newline-terminated.
            f.write(json.dumps(record) + "\n")

    return output_path


if __name__ == "__main__":
    # Generate the file, then read it back and echo every record's
    # substitutions as a quick sanity check of what was written.
    written_path = problem_to_json()

    loaded_problems = []
    with open(written_path, "r", encoding="utf-8") as file:
        for line in file:
            # Parse each line as a JSON object
            loaded = json.loads(line)
            loaded_problems.append(loaded)
            print(loaded["substitutions"])
            print()

    # Now 'loaded_problems' is a list of dictionaries, each representing a problem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__PROMPT__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import os | ||
import json | ||
import autogen | ||
|
||
import testbed_utils | ||
|
||
def _slurp(filename):
    """Return the entire text content of *filename*."""
    with open(filename, "rt") as handle:
        return handle.read()


# NOTE(review): init() is a testbed hook whose effect is not visible from this
# file; it is kept as the very first action of the scenario.
testbed_utils.init()

# prompt.txt / answer.txt are read from the current working directory. Their
# templates hold the __PROMPT__ / __ANSWER__ placeholders, which problems.jsonl
# maps to a concrete problem and its ground-truth answer.
PROMPT = _slurp("prompt.txt")
ANSWER = _slurp("answer.txt")
|
||
|
||
####################
# LLM settings shared by every LLM-backed agent in this scenario.
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST", filter_dict={"model": ["gpt40613"]})
llm_config = {"cache_seed": 42, "config_list": config_list, "timeout": 600}

# Code produced during the chats is executed under ./coding.
code_execution_config = dict(
    work_dir="coding",
    use_docker=False,  # set to True or image name like "python:3" to use docker
)


def _ends_with_terminate(message):
    """Tell whether *message* has content that ends in "TERMINATE" or "TERMINATE.".

    Trailing whitespace is ignored. Empty or missing content yields a falsy value.
    """
    content = message.get("content", "")
    return content and content.rstrip().endswith(("TERMINATE", "TERMINATE."))


# ---------between "user" and "assistant"---------
# "assistant" is the LLM agent; "user" never asks a human for input and
# auto-replies at most 10 times in a row.
assistant = autogen.AssistantAgent(name="assistant", llm_config=llm_config)
user_proxy = autogen.UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    code_execution_config=code_execution_config,
    max_consecutive_auto_reply=10,
    is_termination_msg=_ends_with_terminate,
)

# Start the solving conversation with the problem statement.
user_proxy.initiate_chat(assistant, message=PROMPT)
|
||
|
||
# --------- extract reply ---------
# Scan the assistant's view of its chat with the user proxy from newest to
# oldest and keep the first assistant-role message that is more than a bare
# termination marker. Stays "" when there is no such message.
# NOTE(review): assumes every assistant-role message has string content -- confirm.
_BARE_TERMINATION = ("TERMINATE", "TERMINATE.")
messages = assistant._oai_messages[user_proxy]
response_with_ans = next(
    (
        entry["content"]
        for entry in reversed(messages)
        if entry["role"] == "assistant" and entry["content"].strip() not in _BARE_TERMINATION
    ),
    "",
)
|
||
|
||
# ---------between "answer_checker" and "checker_proxy"---------
# Second conversation: an LLM "checker" compares the extracted reply with the
# ground-truth answer and must finish with one of the fixed verdict sentences
# listed in its system message.

check_sys_msg = """You are a helpful AI assistant. You will use your coding and language skills to verify the answer.
You are given:
1. A problem.
2. A reply with the answer to the problem.
3. A ground truth answer.
Please do the following:
1. Extract the answer in the reply: "The answer is <answer extracted>".
2. Check whether the answer in the reply matches the ground truth answer. When comparison is not obvious (for example, 3*\\sqrt(6) and 7.348), you may write code to check the answer and wait for the user to execute the code.
3. After everything is done, please choose a reply from the following options:
- "The answer is correct."
- "The answer is approximated but should be correct. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
- "The answer is incorrect. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
- "The reply doesn't contain an answer." """

# Lower-cased verdict phrases; seeing any of them in a message ends the checker chat.
_VERDICT_PHRASES = (
    "the answer is correct",
    "the answer is incorrect",
    "the reply doesn't contain an answer",
    "the answer is approximated but should be correct",
)


def _contains_verdict(message):
    """Tell whether *message* content contains a verdict phrase (case-insensitive).

    Empty content yields a falsy value.
    """
    lowered = message.get("content", "").lower()
    return lowered and any(phrase in lowered for phrase in _VERDICT_PHRASES)


answer_checker = autogen.AssistantAgent(name="checker", llm_config=llm_config, system_message=check_sys_msg)
# The proxy never asks a human for input and auto-replies at most 5 times in a row.
checker_proxy = autogen.UserProxyAgent(
    name="checker_proxy",
    human_input_mode="NEVER",
    code_execution_config=code_execution_config,
    max_consecutive_auto_reply=5,
    is_termination_msg=_contains_verdict,
)

# Give the checker the problem, the solver's reply and the ground truth in one message.
message_to_check = f"Problem: {PROMPT}\n\nReply: {response_with_ans}\n\nGround truth answer: {ANSWER}"
checker_proxy.initiate_chat(answer_checker, message=message_to_check)

####################
# Hand all four agents to the testbed.
# NOTE(review): finalize() presumably saves each agent's messages (the counting
# script reads checker_messages.json) -- confirm in testbed_utils.
testbed_utils.finalize(agents=[assistant, user_proxy, answer_checker, checker_proxy])