Commit 3dcd015

Merge pull request #60 from CaraJ7/main
Add MathVerse
2 parents 95df9fe + 743673a commit 3dcd015

10 files changed (+652, -0 lines)

Diff for: lmms_eval/tasks/mathverse/mathverse.yaml (+14)

@@ -0,0 +1,14 @@
group: mathverse
task:
  - mathverse_testmini
  - mathverse_testmini_text_only
  - mathverse_testmini_text_lite
  - mathverse_testmini_text_dominant
  - mathverse_testmini_vision_intensive
  - mathverse_testmini_vision_dominant
  - mathverse_testmini_vision_only
metadata:
  version: 0.0
  gpt_eval_model_name: "gpt-3.5-turbo"
  trunk_response: 30
  quick_match: false
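The metadata block is consumed by the GPT-based scorer added in mathverse_evals.py below: gpt_eval_model_name names the judge model, trunk_response keeps only the last N whitespace-separated words of each prediction before answer extraction, and quick_match swaps the GPT consistency judge for exact string matching. A minimal sketch of the truncation behaviour, with a hand-written config dict and prediction string used purely for illustration:

# Illustration only: mirrors the slice applied in MathVerseEvaluator.eval_results()
config = {"metadata": {"trunk_response": 30, "quick_match": False}}

full_prediction = "Step 1: ... Step 2: ... Therefore, the perimeter is approximately 22.3 meters."
if config["metadata"].get("trunk_response", -1) > 0:
    # keep only the last `trunk_response` whitespace-separated words
    prediction = " ".join(full_prediction.split(" ")[-config["metadata"]["trunk_response"]:])
else:
    prediction = full_prediction
print(prediction)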

Diff for: lmms_eval/tasks/mathverse/mathverse_evals.py (+306)

@@ -0,0 +1,306 @@
import time
import requests
import logging
from tqdm import tqdm
import pandas as pd

eval_logger = logging.getLogger("lmms-eval")

DEMO_PROMPT_EXTRACT = """
I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.

1.
Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
Extracted Answer: (-2, 1)

2.
Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
Extracted Answer: D

3.
Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)

4.
Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
Extracted Answer: null

5.
Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
Extracted answer: 22.3

6.
Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
Extracted answer: f(x) = -x^2 - 2x + 1

7.
"""

DEMO_PROMPT_SCORE = """
Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If they are consistent, Judement is 1; if they are different, Judement is 0.

[Question]: Write the set of numbers represented on the number line in interval notation.
[Standard Answer]: (-2,1]
[Model_answer] : Extracted Answer: \\((-2, 1)\\)
Judgement: 0

[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : B:2\u221a{{3}}
Judgement: 0

[Question]: Find the domain and range of the function f using interval notation.
[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
[Model_answer] : Range: \\((-4, 1]\\)
Judgement: 0

[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : null
Judgement: 0

[Question]: Given the graph of the ellipse that intersects with x-axis at 9 and -9 and with y-axis at 3 and -3, determine its equation.A. \\frac{{x^2}}{{81}} + \\frac{{y^2}}{{9}} = 1 B. Can not determine.\n
[Standard Answer]: A
[Model_answer] : \\frac{{x^2}}{{81}} + \\frac{{y^2}}{{9}} = 1
Judgement: 1

[Question]: {question}
[Standard Answer]: {gt}
[Model_answer] : {extraction}
Judgement: """

class MathVerseEvaluator:
    API_URL = "https://api.openai.com/v1/chat/completions"

    def __init__(self, api_key, gpt_model="gpt-3.5-turbo"):
        self.api_key = api_key
        self.gpt_model = gpt_model

    def _post_request(self, payload):
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        return response.json()

    def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0):
        messages = [
            {"role": "user", "content": prompt},
        ]
        payload = {"model": self.gpt_model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "n": n}

        while patience > 0:
            patience -= 1
            try:
                response = self._post_request(payload)
                if n == 1:
                    prediction = response["choices"][0]["message"]["content"].strip()
                    if prediction and prediction != "":
                        return prediction
                else:
                    prediction = [choice["message"]["content"].strip() for choice in response["choices"]]
                    if prediction and prediction[0] != "":
                        return prediction

            except Exception as e:
                # some model may output repetitive answer, which ChatGPT will throw an error.
                if 'repetitive patterns' in str(e):
                    print(str(e))
                    print("Continue with empty answer")
                    return ""
                # some answer may contain some sensitive words, like 'test'
                if 'sensitive' in str(e) or '400' in str(e):
                    print(str(e))
                    print("Continue with empty answer")
                    return "0"

                if "Rate limit" not in str(e):
                    eval_logger.error(e)

                if "Please reduce the length of the messages" in str(e):
                    eval_logger.error("!!Reduce prompt size")
                    # reduce input prompt and keep the tail
                    new_size = int(len(prompt) * 0.9)
                    new_start = len(prompt) - new_size
                    prompt = prompt[new_start:]
                    payload["messages"] = [
                        {"role": "user", "content": prompt},
                    ]

            if sleep_time > 0:
                time.sleep(sleep_time)
        return ""

    def verify_extraction(self, extraction):
        extraction = extraction.strip()
        if not extraction:
            return False
        return True

    def create_extract_prompt(self, demo_prompt, response):
        demo_prompt = demo_prompt.strip()
        test_prompt = f"Model response: '{response}'\nExtracted Answer: "
        full_prompt = f"{demo_prompt}\n\n{test_prompt}"
        return full_prompt

    def create_match_prompt(self, demo_prompt, question, answer, extraction):
        demo_prompt = demo_prompt.strip()
        full_prompt = demo_prompt.format(question=question, gt=answer, extraction=extraction)
        return full_prompt

    def extract_answer(self, response):

        if not response:
            return ""

        # general extraction
        try:
            full_prompt = self.create_extract_prompt(DEMO_PROMPT_EXTRACT, response)
            extraction = self.get_chat_response(full_prompt, temperature=0, max_tokens=256, n=1)
            return extraction
        except Exception as e:
            eval_logger.error(e)
            eval_logger.error(f"Error in extracting answer for problem")

        return ""

    def score_answer(self, question, answer, extraction, quick_match=False):
        if quick_match:
            return extraction == answer

        try:
            full_prompt = self.create_match_prompt(DEMO_PROMPT_SCORE, question, answer, extraction)
            while True:
                extraction = self.get_chat_response(full_prompt, temperature=0, max_tokens=8, n=1)
                judgement = extraction.replace("Judgement:", "").strip()
                if judgement.strip() in ['0', '1']:
                    return int(judgement) == 1

        except Exception as e:
            print(e)
            print(f"Error in matching answer")

        return False


    def get_acc_with_contion(self, res_pd, key, value):
        """
        Calculate the accuracy of predictions with a specific condition
        """
        total_pd = res_pd[res_pd[key] == value]

        correct_pd = total_pd[total_pd["true_false"] == True]
        acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100) if len(total_pd) > 0 else "0.00"
        return len(correct_pd), len(total_pd), acc

    def create_one_query(self, problem, shot_type, hint, query_type, examples=None, shot_num=0, use_caption=False, use_ocr=False):
        ### [1] Demo prompt
        if shot_num == 0:
            demo_prompt = ""
        else:
            demos = []
            shot_num = min(shot_num, len(examples))
            for example in examples[:shot_num]:
                prompt = ""

                # question
                prompt += f"Question: {example[query_type]}"

                # solution
                if shot_type == "solution":
                    solution = example["solution"].strip()
                    prompt += "\n" + f"Solution: {solution}"

                # step-by-step
                if shot_type == "step-by-step":
                    solution = example["solution"].strip()
                    prompt += "\n" + f"{solution}"

                # direct
                if shot_type == "direct":
                    solution = example["solution"].strip()
                    prompt += "\n" + f"{solution}"

                demos.append(prompt)

            demo_prompt = "\n\n".join(demos)

        ### [2] Test query
        # problem info
        question = problem["question"]
        question_type = problem["question_type"]

        # hint
        # format-prompt
        if shot_type == "format-prompt":
            hint_text = ""
        # custom-prompt
        elif shot_type == "custom-prompt":
            if question_type == 'multi-choice':
                hint_text = hint['multi-choice']
            else:  # free-form
                hint_text = hint['free-form']

        # question
        if shot_type == "format-prompt":
            question_text = f"{problem[query_type]}"
        elif shot_type == "custom-prompt":
            question_text = f"Question: {question}"

        elements = [hint_text, question_text]
        test_query = "\n".join([e for e in elements if e != ""])

        ### [3] Final query
        query = demo_prompt + "\n\n" + test_query
        query = query.strip()
        return query

    def eval_results(self, results, config):
        # extract and score for each question
        for inst in tqdm(results):
            full_prediction = inst['prediction'].strip()
            problem = {
                "question_type": inst["question_type"],
                "answer": inst["answer"] if "answer" in inst else None,
                "question_for_eval": inst["question_for_eval"],
            }
            if config['metadata'].get('trunk_response', -1) > 0:
                prediction = ' '.join(full_prediction.split(' ')[-config['metadata']['trunk_response']:])
            else:
                prediction = full_prediction
            extraction = self.extract_answer(prediction)
            # set test set answer to None
            true_false = self.score_answer(problem['question_for_eval'], problem["answer"], extraction, config['metadata']['quick_match']) if problem["answer"] is not None else False

            inst['extraction'] = extraction
            inst['prediction'] = prediction
            inst['true_false'] = true_false

        # calculate total scores
        sample_index = [result["sample_index"] for result in results]
        total = len(results)
        correct = sum(1 for idx, pid in enumerate(sample_index) if results[idx]["true_false"])
        accuracy = round(correct / total * 100, 2)
        scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}

        for result in results:
            result.update(result.pop("metadata"))

        results_dict = {result["sample_index"]: result for result in results}
        df = pd.DataFrame(results_dict).T
        target_keys = ["problem_version", "subfield"]

        for key in target_keys:
            values = df[key].unique()
            scores[key] = {}
            for value in values:
                correct, total, acc = self.get_acc_with_contion(df, key, value)
                if total > 0:
                    scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
            scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]["accuracy"]), reverse=True))

        return results_dict, scores
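For reference, a minimal usage sketch of the evaluator defined above. The API-key handling and the example strings are placeholders for illustration; they are not part of the commit:

import os

from lmms_eval.tasks.mathverse.mathverse_evals import MathVerseEvaluator

# Placeholder: assumes an OpenAI key is available in the environment.
evaluator = MathVerseEvaluator(api_key=os.environ["OPENAI_API_KEY"], gpt_model="gpt-3.5-turbo")

model_output = "Therefore, to one decimal place, the distance d is approximately 22.3 meters."
extracted = evaluator.extract_answer(model_output)  # one GPT call built from DEMO_PROMPT_EXTRACT
correct = evaluator.score_answer(
    question="Find the distance d between Ned and Bart.",
    answer="22.3",
    extraction=extracted,
    quick_match=False,  # True bypasses the GPT judge and compares strings exactly
)
print(extracted, correct)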

Diff for: lmms_eval/tasks/mathverse/mathverse_testmini.yaml (+34)

@@ -0,0 +1,34 @@
dataset_path: CaraJ/MathVerse-lmmseval
dataset_name: testmini
dataset_kwargs:
  token: False
task: "mathverse_testmini"
test_split: testmini
output_type: generate_until
doc_to_visual: !function utils.mathverse_doc_to_visual
doc_to_text: !function utils.mathverse_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function utils.mathverse_process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mathverse_aggregate_results_eval
    higher_is_better: true
  - metric: submission
    aggregation: !function utils.mathverse_aggregate_results_submission
    higher_is_better: true

model_specific_prompt_kwargs:
  default:
    shot_type: "format-prompt" # can also be "custom-prompt"
    query_type: "query_wo" # now only support query_wo
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
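Both metrics hand the per-sample results to aggregation hooks in the task's utils.py, whose diff is not shown in this excerpt. A hypothetical sketch of what a gpt_eval_score aggregation built on MathVerseEvaluator.eval_results could look like; the function name and the api_key plumbing are illustrative assumptions, not the actual utils.mathverse_aggregate_results_eval:

from lmms_eval.tasks.mathverse.mathverse_evals import MathVerseEvaluator

def aggregate_gpt_eval_score(results, config, api_key):
    """Illustrative aggregation: GPT-judge every sample, then return the average accuracy."""
    evaluator = MathVerseEvaluator(api_key=api_key, gpt_model=config["metadata"]["gpt_eval_model_name"])
    _, scores = evaluator.eval_results(results, config)
    # eval_results() returns scores like {"average": {"accuracy": ..., "correct": ..., "total": ...}, ...}
    return scores["average"]["accuracy"]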
Diff for: lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml (+34)

@@ -0,0 +1,34 @@
dataset_path: CaraJ/MathVerse-lmmseval
dataset_name: testmini_version_split
dataset_kwargs:
  token: False
task: "mathverse_testmini_text_dominant"
test_split: text_dominant
output_type: generate_until
doc_to_visual: !function utils.mathverse_doc_to_visual
doc_to_text: !function utils.mathverse_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function utils.mathverse_process_results
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mathverse_aggregate_results_eval
    higher_is_better: true
  - metric: submission
    aggregation: !function utils.mathverse_aggregate_results_submission
    higher_is_better: true

model_specific_prompt_kwargs:
  default:
    shot_type: "format-prompt" # can also be "custom-prompt"
    query_type: "query_wo" # now only support query_wo
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
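The shot_type and query_type values above plug into MathVerseEvaluator.create_one_query when prompts are built. A small illustration with a hand-written problem dict (real documents come from the CaraJ/MathVerse-lmmseval dataset; the field values here are made up):

from lmms_eval.tasks.mathverse.mathverse_evals import MathVerseEvaluator

evaluator = MathVerseEvaluator(api_key="sk-placeholder")  # no API call is made below
problem = {
    "question": "What is the area of the shaded region?",
    "question_type": "free-form",
    "query_wo": "Answer the question shown in the image.",  # made-up field content
}
# With shot_type="format-prompt" and shot_num=0, the query is simply problem["query_wo"].
query = evaluator.create_one_query(problem, shot_type="format-prompt", hint=None, query_type="query_wo")
print(query)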
