LLM_eval.py
import datasets
import openai
import os
import time
import sys
import random
import csv
import re
import statistics
import argparse
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, Dataset
from roundtable import roundTable
import utils
from utils import getResponse, grade_answer, llm_judge
# openai.api_key = os.getenv("OPENAI_API_KEY")
# System messages for question answering
system_messages = {
    'normal': "You are taking a test. Provide your answers by responding only with the number of the appropriate answer for the presented question",
    'researcher': "Act as a researcher with an IQ of 180 that is an expert at problem solving, common sense reasoning, and strategy. You are taking a test. Provide your answers by responding only with the number of the appropriate answer for the presented question",
    'persona': "You are taking a test. Act as the persona provided and provide your answers by responding only with the number of the appropriate answer for the presented question",
    'roundtable_admin_initial': "You are taking a test. Provide your answers by responding with the number of the appropriate answer for the presented question as well as your reasoning for choosing it.",
    'roundtable_expert': "You are {}, also referred to as {}.\n You are assisting the administrator in taking a test by offering useful critique and information. Provide feedback on the most recent answer given by the administrator, as well as their reasoning, and offer suggested changes if you think the answer is incorrect, along with your reasoning why. Pay attention to the feedback of any other experts and correct any incorrect information or suggestions. ((Be succinct and only suggest answers that are provided by the question. Do not provide overly long feedback. Do not exceed 1500 characters in your response))",
    'roundtable_admin_revisor': "You are taking a test. Revise the previous answer according to the feedback provided by the experts you are collaborating with. ((You are not allowed to change the answers to the question, only the choice of answer you make.))",
    'roundtable_admin_decider': "You are taking a test. Decide the best answer given the feedback and revisions that have been made. ((Provide your answers by responding only with the number of the appropriate answer for the presented question.))",
    'roundtable_creator': """You are an expert at creating useful personas. You create detailed personas of useful experts for answering the questions you are given.
((When you return the personas, be sure to separate them with a sequence of two newlines, followed by 5 dashes, followed by two newlines. For example:
Persona description 1
-----
Persona description 2))
""",
    'clean_dataset': "nothing",
    'judge': """You are a test proctor. You will receive a question, a response and a correct answer, and you should respond with a number from 1 through 10 that rates how well the response captures the main points of the correct answer, where 1 is not at all and 10 is completely.
The following is an example:
Question: What is the meaning of life?
Correct answer: Generally one should seek to improve the world around them by being a good steward of nature and improving the lives of other people.
Response: People should take care of their environment and do their best to help each other
Grade: 10
Another example with the same question follows:
Question: What is the meaning of life?
Correct answer: Generally one should seek to improve the world around them by being a good steward of nature and improving the lives of other people.
Response: People should be good to each other
Grade: 6
Another example with the same question follows:
Question: What is the meaning of life?
Correct answer: Generally one should seek to improve the world around them by being a good steward of nature and improving the lives of other people.
Response: Doing what makes you feel good
Grade: 3
Another example with the same question follows:
Question: What is the meaning of life?
Correct answer: Generally one should seek to improve the world around them by being a good steward of nature and improving the lives of other people.
Response: Breaking rocks
Grade: 1
(((Your responses should always be formatted by two newlines, then the string 'Grade:' followed by the number grade for the given response. Respond with nothing other than what was just detailed.)))""",
    'normal_long': "You are taking a test. Provide your answers by responding with one to two sentences.",
    'researcher_long': "Act as a researcher with an IQ of 180 that is an expert at problem solving, common sense reasoning, and strategy. You are taking a test. Provide your answers by responding with one to two sentences.",
    'persona_long': "You are taking a test. Act as the persona provided and provide your answers by responding with one to two sentences.",
    'roundtable_admin_initial_long': "You are taking a test. Provide your answers by responding with one to two sentences as well as separately providing your reasoning for your answer.",
    'roundtable_expert_long': "You are {}, also referred to as {}.\n You are assisting the administrator in taking a test by offering useful critique and information. Provide feedback on the most recent answer given by the administrator, as well as their reasoning, and offer suggested changes if you think the answer is incorrect, along with your reasoning why. Pay attention to the feedback of any other experts and correct any incorrect information or suggestions. ((Be succinct. Do not provide overly long feedback. Do not exceed 1500 characters in your response))",
    'roundtable_admin_revisor_long': "You are taking a test. Revise the previous answer according to the feedback provided by the experts you are collaborating with.",
    'roundtable_admin_decider_long': "You are taking a test. Decide the best answer given the feedback and revisions that have been made. ((Provide your answers by responding with one to two sentences.))",
}
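# Illustrative note (not part of the script's logic): the two "{}" placeholders in the
# roundtable_expert templates above carry a persona's name and alias, which the roundtable
# module presumably fills with str.format before the message is sent, along the lines of:
#   expert_system = system_messages['roundtable_expert'].format("Dr. Ada Chen", "the statistician")
# The persona name and alias here are made-up example values.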
def takeTest(dataset_name="commonsense_qa", style="normal", question_limit=100, grading=True, output_file=None, judge_evaluations=1):
    # Initialize the CSV logging file
    if output_file:
        recording_file = open(output_file + "_{}_{}_{}_{}questions.csv".format(style, dataset_name, question_limit, utils.model_selection), 'w', encoding="utf-8", newline='')
        record = csv.writer(recording_file)
        record.writerow(['Prompt', 'Answer', 'Correct answer', 'Evaluation'])
    # Load the requested dataset; it is shuffled below and a subset is selected based on the question limit
    if dataset_name == "commonsense_qa":
        dataset = load_dataset("commonsense_qa", split='train')
    elif dataset_name == "ai2_arc":
        dataset = load_dataset("ai2_arc", 'ARC-Challenge', split='train', ignore_verifications=True)
    elif dataset_name == "mmlu":
        dataset = load_dataset("cais/mmlu", "all", split="auxiliary_train")
    elif dataset_name == "databricks":
        dataset = load_dataset("databricks/databricks-dolly-15k", split='train')
    elif dataset_name == "databricks_sub":
        # Keep only the creative_writing and brainstorming categories
        dataset = load_dataset("databricks/databricks-dolly-15k", split='train')
        desired_categories = ["creative_writing", "brainstorming"]
        to_remove = []
        large_count = 0
        for entry in dataset:
            if not entry['category'] in desired_categories:
                large_count += 1
                to_remove.append(entry)
        print(large_count)
        dataset = dataset.to_list()
        for entry in to_remove:
            dataset.remove(entry)
        dataset = Dataset.from_list(dataset)
    elif dataset_name == "LongForm":
        # Drop entries whose input or output exceeds 2000 characters
        dataset = load_dataset("akoksal/LongForm", split='train')
        to_remove = []
        large_count = 0
        for entry in dataset:
            if len(entry['output']) > 2000 or len(entry['input']) > 2000:
                large_count += 1
                to_remove.append(entry)
        print(large_count)
        dataset = dataset.to_list()
        for entry in to_remove:
            dataset.remove(entry)
        dataset = Dataset.from_list(dataset)
    elif dataset_name == "FinTalk":
        dataset = load_dataset("ceadar-ie/FinTalk-19k", split='train')
    else:
        dataset = load_dataset("pubmed_qa", 'pqa_artificial')
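    # Side note (a sketch, not used by the script): the manual "to_remove" loops above could
    # equivalently use the datasets library's built-in filter, e.g. for the databricks_sub case:
    #   dataset = dataset.filter(lambda entry: entry['category'] in ["creative_writing", "brainstorming"])
    # and for LongForm:
    #   dataset = dataset.filter(lambda entry: len(entry['output']) <= 2000 and len(entry['input']) <= 2000)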
    # if dataset_name == "LongForm":
    shuffle_seed = random.randint(0, 1000)
    shuffled_dataset = dataset.shuffle(seed=shuffle_seed)
    selection = shuffled_dataset.select([i for i in range(min(question_limit, len(shuffled_dataset)))])
    correct = 0
    incorrect = 0
    invalid = 0
    aggregate_scores = []
    for row in selection:
        record_row = []
        if dataset_name in ["pubmed_qa", "databricks", "databricks_sub", "LongForm", "FinTalk"]:
            choices = ["yes", "no", "maybe"]
        else:
            choices = row['choices']
        if dataset_name in ("commonsense_qa", "ai2_arc"):
            choices = choices["text"]
        # Build the question prompt for the current dataset
        if dataset_name == "commonsense_qa":
            print(choices)
            content = "{} \n1. {} \n2. {} \n3. {} \n4. {} \n5. {}".format(row['question'], choices[0],
                                                                          choices[1], choices[2], choices[3], choices[4])
        elif dataset_name == "pubmed_qa":
            content = ""
            for context in row['context']['contexts']:
                content += context
            content += ".\n\n" + row['question'] + "\n"
            if not (style[-4:] == "long"):
                for i in range(len(choices)):
                    content += "{}. {} \n".format(i + 1, choices[i])
        elif dataset_name in ["databricks", "databricks_sub"]:
            content = row["context"]
            content += "\n\n" + row["instruction"]
        elif dataset_name == "LongForm":
            content = row["input"]
        elif dataset_name == "FinTalk":
            content = f"{row['context']}\n{row['instruction']}"
        else:
            content = row['question'] + "\n"
            # print(row)
            for i in range(len(choices)):
                content += "{}. {} \n".format(i + 1, choices[i])
        # print("Sending question {} of {}".format((correct + incorrect + 1), len(selection)))
        # print(content)
        # Query the model using the selected prompting style
        if style == "roundtable":
            LLM_response = roundTable(content, record=record, record_file=recording_file)
        elif style == "roundtable_long":
            LLM_response = roundTable(content, record=record, record_file=recording_file, style="long")
        elif style == "clean_dataset":
            correct = 0
            incorrect = 1
            continue
        else:
            if style in ["persona", "persona_long"]:
                # First ask the model for an expert persona, then answer the question as that persona
                if dataset_name in ["LongForm"]:
                    persona_content = "Describe a detailed persona of an expert who would be able to answer the following question:\n {}".format(content)
                else:
                    persona_content = "Describe a detailed persona of an expert who would be able to answer the following question:\n {}".format(content)
                messages = [
                    {"role": "system", "content": "You are an expert at describing personas. Return a detailed description of only the persona that was requested."},
                    {"role": "user", "content": persona_content}
                ]
                response = getResponse(messages)
                LLM_response = response
                content = "Act as {} when answering the following question:\n".format(LLM_response) + "\n" + content
            messages = [
                {"role": "system", "content": system_messages[style]},
                {"role": "user", "content": content}
            ]
            print("Sending question {} of {}".format((correct + incorrect + 1), len(selection)))
            print(content)
            LLM_response = getResponse(messages)
        # Grade the response: free-form ("long") answers go through the LLM judge, otherwise multiple-choice grading is used
        grading_style = "long" if style[-4:] == "long" else "mc"
        # print(f"grading_style: {grading_style}")
        # print(style[-4:])
        if grading_style == "long":
            correct, incorrect, invalid, rating = grade_answer(LLM_response, dataset_name, row, record, content, output_file, correct, incorrect, invalid, style=grading_style, judge_eval_iterations=judge_evaluations)
            aggregate_scores.append(rating)
        else:
            correct, incorrect, invalid = grade_answer(LLM_response, dataset_name, row, record, content, output_file, correct, incorrect, invalid, style=grading_style)
        recording_file.flush()
        time.sleep(3)
print("""
Total score: {}/{}
Percentage: {}
""".format(correct, (incorrect + correct), str(float(correct)/(incorrect + correct))))
record.writerow(["Total", "{}/{}".format(correct, (incorrect + correct)), "Percentage", str(float(correct)/(incorrect + correct)),
"Incorrect", str(incorrect-invalid), "Invalid", str(invalid)])
if grading_style == "long":
record.writerow(["Mean score", "{}".format(statistics.mean(aggregate_scores)),
"Variance", str(statistics.variance(aggregate_scores))])
print("""
Mean score: {}
Variance: {}
""".format(statistics.mean(aggregate_scores), str(statistics.variance(aggregate_scores))))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='LLM_eval',
        description='Evaluate prompting styles (normal, researcher, persona, roundtable) on question-answering datasets.',
        epilog='Results are written to CSV files named after the chosen file base, style, dataset, question count and model.')
    parser.add_argument('-q', '--questions', action='store', type=int, default=10)
    parser.add_argument('-s', '--style', action='store', default='all')
    parser.add_argument('-e', '--evaluation_style', action='store', default='long')
    parser.add_argument('-d', '--dataset', action='store', default="LongForm")
    parser.add_argument('-f', '--file_base', action='store', default="answers")
    parser.add_argument('-m', '--model', action='store', default="gpt-3.5-turbo")
    parser.add_argument('-j', '--judge_evals', action='store', type=int, default=3)
    args = parser.parse_args()
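    # Example invocations (flags as defined above; values are illustrative):
    #   python LLM_eval.py -q 20 -s normal -e mc -d commonsense_qa -f answers -m gpt-3.5-turbo
    #   python LLM_eval.py -q 10 -s judge_eval   # runs only the judge consistency check below, then exits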
    utils.init()
    utils.model_selection = args.model
    all_styles = ["normal", "researcher", "persona", "roundtable"]
    all_styles_long = ["normal_long", "researcher_long", "persona_long", "roundtable_long"]
    no_roundtable = ["normal", "researcher", "persona"]
    no_roundtable_long = ["normal_long", "researcher_long", "persona_long"]
    question_limit = args.questions
    if args.style == "judge_eval":
        # Sanity-check the LLM judge: repeatedly grade one clearly wrong and one clearly right
        # rewording of the reference answer and report the mean and variance of the grades
        sample_answer = """
        As ILC2s are elevated in patients with CRSwNP, they may drive nasal polyp formation in CRS.
        ILC2s are also linked with high tissue and blood eosinophilia and have a potential role in the activation and survival of eosinophils during the Th2 immune response.
        The association of innate lymphoid cells in CRS provides insights into its pathogenesis.
        """
        sample_wrong_answer = """
        As ILC2s are decreased in patients with CRSwNP, they may prevent nasal polyp formation in CRS.
        ILC2s are also negatively linked with high tissue and blood eosinophilia and have a potential role in the deactivation of eosinophils during the Th2 immune response.
        The association of innate lymphoid cells in CRS provides no insights into its pathogenesis.
        """
        sample_right_answer = """
        As ILC2s are increased in patients with CRSwNP, they may encourage nasal polyp formation in CRS.
        ILC2s are also associated with high tissue and blood eosinophilia and have a potential role in the continuation of eosinophils during the Th2 immune response.
        The association of innate lymphoid cells in CRS reveals insights into its pathogenesis.
        """
        sample_question = "Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?"
        judge_wrong_answers = []
        judge_right_answers = []
        for i in range(question_limit):
            # llm_judge is expected to end its reply with "Grade: <number>"; parse the trailing
            # digits rather than only the last character so a grade of 10 is not read as 0
            judge_wrong_answers.append(int(re.search(r"(\d+)\s*$", llm_judge(sample_wrong_answer, sample_question, sample_answer)).group(1)))
            judge_right_answers.append(int(re.search(r"(\d+)\s*$", llm_judge(sample_right_answer, sample_question, sample_answer)).group(1)))
        print(judge_wrong_answers)
        print(f"""variance_wrong = {statistics.variance(judge_wrong_answers)}
mean_wrong = {statistics.mean(judge_wrong_answers)}""")
        print(judge_right_answers)
        print(f"""variance_right = {statistics.variance(judge_right_answers)}
mean_right = {statistics.mean(judge_right_answers)}""")
        sys.exit()
    if args.evaluation_style == "mc":
        if args.style.lower() == "all":
            styles = all_styles
        elif args.style.lower() == "no_roundtable":
            styles = no_roundtable
        else:
            styles = [args.style.lower()]
        for style in styles:
            # Roundtable makes several model calls per question, so halve its question count when running all styles
            if args.style.lower() == "all" and style == 'roundtable':
                takeTest(style=style, question_limit=question_limit // 2, output_file="output/" + args.file_base, dataset_name=args.dataset, judge_evaluations=args.judge_evals)
            else:
                takeTest(style=style, question_limit=question_limit, output_file="output/" + args.file_base, dataset_name=args.dataset, judge_evaluations=args.judge_evals)
    else:
        if args.style.lower() == "all":
            styles = all_styles_long
        elif args.style.lower() == "no_roundtable":
            styles = no_roundtable_long
        else:
            styles = [args.style.lower() + "_long"]
        for style in styles:
            # Same halving for the long-form roundtable run
            if args.style.lower() == "all" and style == 'roundtable_long':
                takeTest(style=style, question_limit=question_limit // 2, output_file="output/" + args.file_base, dataset_name=args.dataset, judge_evaluations=args.judge_evals)
            else:
                takeTest(style=style, question_limit=question_limit, output_file="output/" + args.file_base, dataset_name=args.dataset, judge_evaluations=args.judge_evals)