From 3f22352755910ce88178731d71c322dfc67e6b36 Mon Sep 17 00:00:00 2001 From: kevin666aa Date: Thu, 7 Dec 2023 23:19:44 -0500 Subject: [PATCH 1/2] add MATH eval to testbed --- .../testbed/includes/math_requirements.txt | 4 + .../tools/testbed/scenarios/MATH/README.md | 17 ++++ .../tools/testbed/scenarios/MATH/answer.txt | 1 + .../scenarios/MATH/count_correct_math.py | 56 +++++++++++ .../testbed/scenarios/MATH/problems.jsonl | 17 ++++ .../scenarios/MATH/problems_to_json.py | 77 +++++++++++++++ .../tools/testbed/scenarios/MATH/prompt.txt | 1 + .../tools/testbed/scenarios/MATH/scenario.py | 98 +++++++++++++++++++ 8 files changed, 271 insertions(+) create mode 100644 samples/tools/testbed/includes/math_requirements.txt create mode 100644 samples/tools/testbed/scenarios/MATH/README.md create mode 100644 samples/tools/testbed/scenarios/MATH/answer.txt create mode 100644 samples/tools/testbed/scenarios/MATH/count_correct_math.py create mode 100644 samples/tools/testbed/scenarios/MATH/problems.jsonl create mode 100644 samples/tools/testbed/scenarios/MATH/problems_to_json.py create mode 100644 samples/tools/testbed/scenarios/MATH/prompt.txt create mode 100644 samples/tools/testbed/scenarios/MATH/scenario.py diff --git a/samples/tools/testbed/includes/math_requirements.txt b/samples/tools/testbed/includes/math_requirements.txt new file mode 100644 index 000000000000..0600c8ce047a --- /dev/null +++ b/samples/tools/testbed/includes/math_requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/microsoft/autogen.git +sympy +matplotlib +numpy diff --git a/samples/tools/testbed/scenarios/MATH/README.md b/samples/tools/testbed/scenarios/MATH/README.md new file mode 100644 index 000000000000..e2d02bc21631 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/README.md @@ -0,0 +1,17 @@ +## Get json file to run + +```sh +cd samples/tools/testbed/ +python scenarios/MATH/problems_to_json.py +``` + +## Run the testbed + +```sh +python run_scenarios.py scenarios/MATH/problems.jsonl -c 
--requirements math_requirements.txt +``` + +## Get the correct count + +```sh +python scenarios/MATH/count_correct_math.py ``` diff --git a/samples/tools/testbed/scenarios/MATH/answer.txt b/samples/tools/testbed/scenarios/MATH/answer.txt new file mode 100644 index 000000000000..42844f73f66d --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/answer.txt @@ -0,0 +1 @@ +__ANSWER__ diff --git a/samples/tools/testbed/scenarios/MATH/count_correct_math.py b/samples/tools/testbed/scenarios/MATH/count_correct_math.py new file mode 100644 index 000000000000..6f9a1fae1203 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/count_correct_math.py @@ -0,0 +1,56 @@ +import argparse +import json +import os + + +def main(args): + stars = "*" * 100 + + # initiate the correct count for each trial + correct_count = [0 for i in range(args.num_trials)] + + for i in range(args.num_trials): + for problem_name in os.listdir(args.path): + problem_path = os.path.join(args.path, problem_name, str(i)) + if os.path.isdir(problem_path): + checker_file_path = os.path.join(problem_path, "checker_messages.json") + + with open(checker_file_path, "r") as file: + checker_messages = json.load(file) + + check_result = checker_messages["checker_proxy"][-1]["content"].lower() + + if ( + "the answer is correct" in check_result + or "the answer is approximated but should be correct" in check_result + ): + correct_count[i] += 1 + # print(f"{problem_name} | Correct") + # else: + # print(f"{problem_name} | Wrong") + + print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(os.listdir(args.path))}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Print Math Problems results.""".strip(), + ) + parser.add_argument( + "--path", + "-p", + type=str, + default="samples/tools/testbed/scenarios/MATH/problems", + help="Path to the problems directory", + ) + # num trials + parser.add_argument( + "--num_trials", + "-n", + type=int, + default=1, 
+ help="Number of trials to check", + ) + + args = parser.parse_args() + main(args) diff --git a/samples/tools/testbed/scenarios/MATH/problems.jsonl b/samples/tools/testbed/scenarios/MATH/problems.jsonl new file mode 100644 index 000000000000..872d872c7160 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/problems.jsonl @@ -0,0 +1,17 @@ +{"id": "problem0", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation."}, "answer.txt": {"__ANSWER__": "(-\\infty, -14)\\cup(-3,\\infty)"}}} +{"id": "problem1", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]"}, "answer.txt": {"__ANSWER__": "93"}}} +{"id": "problem2", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? 
Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible."}, "answer.txt": {"__ANSWER__": "4x-5y=-50"}}} +{"id": "problem3", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}"}, "answer.txt": {"__ANSWER__": "-5"}}} +{"id": "problem4", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?"}, "answer.txt": {"__ANSWER__": "2"}}} +{"id": "problem5", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation."}, "answer.txt": {"__ANSWER__": "(-\\infty,0]\\cup[4,\\infty)"}}} +{"id": "problem6", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$."}, "answer.txt": {"__ANSWER__": "\\frac{10}{9}"}}} +{"id": "problem7", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?"}, "answer.txt": {"__ANSWER__": "\\frac{1}{2}"}}} +{"id": "problem8", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$"}, "answer.txt": {"__ANSWER__": "-588"}}} +{"id": "problem9", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. 
If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction."}, "answer.txt": {"__ANSWER__": " \\frac{1}{13}"}}} +{"id": "problem10", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]"}, "answer.txt": {"__ANSWER__": "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]"}}} +{"id": "problem11", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?"}, "answer.txt": {"__ANSWER__": "23"}}} +{"id": "problem12", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive."}, "answer.txt": {"__ANSWER__": "116"}}} +{"id": "problem13", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. 
When Camera A and Camera B take their last picture together, how many minutes before noon is it?"}, "answer.txt": {"__ANSWER__": "41"}}} +{"id": "problem14", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$"}, "answer.txt": {"__ANSWER__": "43"}}} +{"id": "problem15", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. How many people are wearing all three items?"}, "answer.txt": {"__ANSWER__": "10"}}} +{"id": "problem16", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$"}, "answer.txt": {"__ANSWER__": "\\frac{5\\sqrt{42}}{27}"}}} diff --git a/samples/tools/testbed/scenarios/MATH/problems_to_json.py b/samples/tools/testbed/scenarios/MATH/problems_to_json.py new file mode 100644 index 000000000000..436217a8c251 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/problems_to_json.py @@ -0,0 +1,77 @@ +import json + +problems = [ + "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.", + "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]", + "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. 
Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.", + "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}", + "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?", + "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.", + "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.", + "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?", + "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$", + "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.", + "Let $x$ and $y$ be real numbers. 
Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]", + "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?", + "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.", + "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. When Camera A and Camera B take their last picture together, how many minutes before noon is it?", + "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$", + "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. 
How many people are wearing all three items?", + "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$", +] +answers = [ + # 6 algebra + "(-\\infty, -14)\\cup(-3,\\infty)", + "93", + "4x-5y=-50", + "-5", + "2", + "(-\\infty,0]\\cup[4,\\infty)", + # 11 problems, 2 from each category, (1 algebra is deleted) + "\\frac{10}{9}", + "\\frac{1}{2}", + "-588", + " \\frac{1}{13}", + "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]", + "23", + "116", + "41", + "43", + "10", + "\\frac{5\\sqrt{42}}{27}", +] + + +def problem_to_json(): + with open("problems.jsonl", "w") as f: + for i, problem in enumerate(problems): + # a = { + # 'id': f'problem{i}', + # 'template': 'scenario.py', + # 'substitutions': { + # '__PROMPT__': problem, + # '__ANSWER__': answers[i], + # }, + # } + a = { + "id": f"problem{i}", + "template": "./", + "substitutions": {"prompt.txt": {"__PROMPT__": problem}, "answer.txt": {"__ANSWER__": answers[i]}}, + } + # Convert the dictionary to a JSON string and write it to the file + json_string = json.dumps(a) + f.write(json_string + "\n") # Add a newline character after each JSON object + + +problem_to_json() + +problems = [] +with open("problems.jsonl", "r") as file: + for line in file: + # Parse each line as a JSON object + problem = json.loads(line) + problems.append(problem) + print(problem["substitutions"]) + print() + +# Now 'problems' is a list of dictionaries, each representing a problem diff --git a/samples/tools/testbed/scenarios/MATH/prompt.txt b/samples/tools/testbed/scenarios/MATH/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/samples/tools/testbed/scenarios/MATH/scenario.py b/samples/tools/testbed/scenarios/MATH/scenario.py new file mode 100644 index 000000000000..222428bcdb93 --- /dev/null +++ b/samples/tools/testbed/scenarios/MATH/scenario.py @@ -0,0 
+1,98 @@ +import os +import json +import autogen + +import testbed_utils + +testbed_utils.init() + + +PROMPT = "" +with open("prompt.txt", "rt") as fh: + PROMPT = fh.read() + +ANSWER = "" +with open("answer.txt", "rt") as fh: + ANSWER = fh.read() + + +#################### +config_list = autogen.config_list_from_json( + "OAI_CONFIG_LIST", + filter_dict={"model": ["gpt40613"]}, +) +print(config_list) +llm_config = { + "cache_seed": 42, + "config_list": config_list, + "timeout": 600, +} +code_execution_config = { + "work_dir": "coding", + "use_docker": False, # set to True or image name like "python:3" to use docker +} +# ---------between "user" and "assistant"--------- +assistant = autogen.AssistantAgent(name="assistant", llm_config=llm_config) +user_proxy = autogen.UserProxyAgent( + name="user", + human_input_mode="NEVER", + code_execution_config=code_execution_config, + max_consecutive_auto_reply=10, + is_termination_msg=lambda x: x.get("content", "") + and (x.get("content", "").rstrip().endswith("TERMINATE") or x.get("content", "").rstrip().endswith("TERMINATE.")), +) + +user_proxy.initiate_chat(assistant, message=PROMPT) + + +# --------- extract reply --------- +response_with_ans = "" +messages = assistant._oai_messages[user_proxy] +for j in range(len(messages) - 1, -1, -1): + if ( + messages[j]["role"] == "assistant" + and messages[j]["content"].strip() != "TERMINATE" + and messages[j]["content"].strip() != "TERMINATE." + ): + response_with_ans = messages[j]["content"] + break + + +# ---------between "answer_checker" and "checker_proxy"--------- +# define answer checker chat + +check_sys_msg = """You are a helpful AI assistant. You will use your coding and language skills to verify the answer. +You are given: + 1. A problem. + 2. A reply with the answer to the problem. + 3. A ground truth answer. +Please do the following: +1. Extract the answer in the reply: "The answer is ". +2. Check whether the answer in the reply matches the ground truth answer. 
When comparison is not obvious (for example, 3*\\sqrt(6) and 7.348), you may write code to check the answer and wait for the user to execute the code. +3. After everything is done, please choose a reply from the following options: + - "The answer is correct." + - "The answer is approximated but should be correct. Correct Answer: | Answer extracted: ." + - "The answer is incorrect. Correct Answer: | Answer extracted: ." + - "The reply doesn't contain an answer." """ + +answer_checker = autogen.AssistantAgent(name="checker", llm_config=llm_config, system_message=check_sys_msg) +checker_proxy = autogen.UserProxyAgent( + name="checker_proxy", + human_input_mode="NEVER", + code_execution_config=code_execution_config, + max_consecutive_auto_reply=5, + is_termination_msg=lambda x: x.get("content", "").lower() + and ( + "the answer is correct" in x.get("content", "").lower() + or "the answer is incorrect" in x.get("content", "").lower() + or "the reply doesn't contain an answer" in x.get("content", "").lower() + or "the answer is approximated but should be correct" in x.get("content", "").lower() + ), +) + +message_to_check = "Problem: " + PROMPT + f"\n\nReply: {response_with_ans}\n\nGround truth answer: " + ANSWER +checker_proxy.initiate_chat(answer_checker, message=message_to_check) + + +#################### +testbed_utils.finalize(agents=[assistant, user_proxy, answer_checker, checker_proxy]) From 530cb42d2a28df04399d37bac4b2a559333c1ea4 Mon Sep 17 00:00:00 2001 From: kevin666aa Date: Wed, 13 Dec 2023 20:01:10 -0500 Subject: [PATCH 2/2] update --- samples/tools/testbed/scenarios/MATH/README.md | 14 ++++++++++++-- .../scenarios/MATH/count_correct_math.py | 2 +- .../tools/testbed/scenarios/MATH/problems.jsonl | 17 ----------------- .../tools/testbed/scenarios/MATH/scenario.py | 1 - 4 files changed, 13 insertions(+), 21 deletions(-) delete mode 100644 samples/tools/testbed/scenarios/MATH/problems.jsonl diff --git a/samples/tools/testbed/scenarios/MATH/README.md 
b/samples/tools/testbed/scenarios/MATH/README.md index e2d02bc21631..7fea2cd0f4bd 100644 --- a/samples/tools/testbed/scenarios/MATH/README.md +++ b/samples/tools/testbed/scenarios/MATH/README.md @@ -1,5 +1,6 @@ ## Get json file to run +This will convert the math problems to json format and put it in the `scenarios/MATH` folder. ```sh cd samples/tools/testbed/ python scenarios/MATH/problems_to_json.py @@ -7,11 +8,20 @@ python scenarios/MATH/problems_to_json.py ## Run the testbed +Note: this will first run autogen on the math problems, and then use an LLM as the answer checker to check the answers. +This means the results are not 100% accurate. + ```sh python run_scenarios.py scenarios/MATH/problems.jsonl -c --requirements math_requirements.txt ``` ## Get the correct count - +Use `--path` or `-p` to specify the path to the problem directory; the default is `./results/problems/`, which is the default save path of this testbed. ```sh -python scenarios/MATH/count_correct_math.py ``` +python scenarios/MATH/count_correct_math.py --path ./results/problems/ +``` + +Example output: +``` +Trial 0 | Total Correct: 10 | Total Problems: 17 +``` diff --git a/samples/tools/testbed/scenarios/MATH/count_correct_math.py b/samples/tools/testbed/scenarios/MATH/count_correct_math.py index 6f9a1fae1203..69766dfb0c5d 100644 --- a/samples/tools/testbed/scenarios/MATH/count_correct_math.py +++ b/samples/tools/testbed/scenarios/MATH/count_correct_math.py @@ -40,7 +40,7 @@ def main(args): "--path", "-p", type=str, - default="samples/tools/testbed/scenarios/MATH/problems", + default="./results/problems/", help="Path to the problems directory", ) # num trials diff --git a/samples/tools/testbed/scenarios/MATH/problems.jsonl b/samples/tools/testbed/scenarios/MATH/problems.jsonl deleted file mode 100644 index 872d872c7160..000000000000 --- a/samples/tools/testbed/scenarios/MATH/problems.jsonl +++ /dev/null @@ -1,17 +0,0 @@ -{"id": "problem0", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find all $x$ 
that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation."}, "answer.txt": {"__ANSWER__": "(-\\infty, -14)\\cup(-3,\\infty)"}}} -{"id": "problem1", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]"}, "answer.txt": {"__ANSWER__": "93"}}} -{"id": "problem2", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? 
Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible."}, "answer.txt": {"__ANSWER__": "4x-5y=-50"}}} -{"id": "problem3", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}"}, "answer.txt": {"__ANSWER__": "-5"}}} -{"id": "problem4", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?"}, "answer.txt": {"__ANSWER__": "2"}}} -{"id": "problem5", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation."}, "answer.txt": {"__ANSWER__": "(-\\infty,0]\\cup[4,\\infty)"}}} -{"id": "problem6", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$."}, "answer.txt": {"__ANSWER__": "\\frac{10}{9}"}}} -{"id": "problem7", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?"}, "answer.txt": {"__ANSWER__": "\\frac{1}{2}"}}} -{"id": "problem8", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$"}, "answer.txt": {"__ANSWER__": "-588"}}} -{"id": "problem9", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. 
If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction."}, "answer.txt": {"__ANSWER__": " \\frac{1}{13}"}}} -{"id": "problem10", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]"}, "answer.txt": {"__ANSWER__": "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]"}}} -{"id": "problem11", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?"}, "answer.txt": {"__ANSWER__": "23"}}} -{"id": "problem12", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive."}, "answer.txt": {"__ANSWER__": "116"}}} -{"id": "problem13", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. 
When Camera A and Camera B take their last picture together, how many minutes before noon is it?"}, "answer.txt": {"__ANSWER__": "41"}}} -{"id": "problem14", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$"}, "answer.txt": {"__ANSWER__": "43"}}} -{"id": "problem15", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. How many people are wearing all three items?"}, "answer.txt": {"__ANSWER__": "10"}}} -{"id": "problem16", "template": "./", "substitutions": {"prompt.txt": {"__PROMPT__": "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$"}, "answer.txt": {"__ANSWER__": "\\frac{5\\sqrt{42}}{27}"}}} diff --git a/samples/tools/testbed/scenarios/MATH/scenario.py b/samples/tools/testbed/scenarios/MATH/scenario.py index 222428bcdb93..89cdfad1aee0 100644 --- a/samples/tools/testbed/scenarios/MATH/scenario.py +++ b/samples/tools/testbed/scenarios/MATH/scenario.py @@ -21,7 +21,6 @@ "OAI_CONFIG_LIST", filter_dict={"model": ["gpt40613"]}, ) -print(config_list) llm_config = { "cache_seed": 42, "config_list": config_list,