heron_bench_scorer.py
from collections import defaultdict

import numpy as np

from eval_mm.utils.azure_client import OpenAIChatAPI
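
# Judge rubrics keyed by question category; HeronBenchScorer.score selects a
# prompt via doc["category"]. Heron Bench itself uses the "conv", "detail",
# and "complex" categories (see the aggregate method below).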
RULES: dict = {
    "coding": {
        "role": "Assistant",
        "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line.",
    },
    "math": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better.",
    },
    "default": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. Sentences are given in Japanese, and language is not relevant to score.",
    },
    "conv": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. Sentences are given in Japanese, and language is not relevant to score.",
    },
    "detail": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. Sentences are given in Japanese, and language is not relevant to score.",
    },
    "complex": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment. Sentences are given in Japanese, and language is not relevant to score.",
    },
    "llava_bench_conv": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.",
    },
    "llava_bench_detail": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.",
    },
    "llava_bench_complex": {
        "role": "Assistant",
        "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.",
    },
}


def parse_score(review: str) -> dict[str, int]:
    """Parse the two scores on the first line of the judge's review."""
    try:
        score_pair = review.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        # Split on any whitespace so comma- or multi-space-separated pairs
        # ("7, 9") parse correctly instead of producing empty tokens.
        sp = score_pair.split()
        if len(sp) == 2:
            # Assistant 1 is the GPT reference answer and Assistant 2 is the
            # model under evaluation, hence the swapped order here.
            return {"score": int(sp[1]), "score_gpt": int(sp[0])}
        else:
            print("error", review)
            return {"score": -1, "score_gpt": -1}
    except Exception as e:
        print(e)
        print("error", review)
        return {"score": -1, "score_gpt": -1}
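
# Example of the expected judge output format (illustrative values only):
#   parse_score("7 9\nAssistant 2's answer was more detailed and accurate.")
#   -> {"score": 9, "score_gpt": 7}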


def ask_gpt4_batch(
    content_list: list[str],
    max_tokens: int,
    async_client: OpenAIChatAPI,
    model_name: str,
) -> list[str]:
    """Send each judge prompt to the judge model and return its responses."""
    message_list = [
        [
            {
                "role": "system",
                "content": "You are a helpful and precise assistant for checking the quality of the answer.",
            },
            {"role": "user", "content": content},
        ]
        for content in content_list
    ]
    completions = async_client.batch_generate_chat_response(
        message_list,
        max_tokens=max_tokens,
        temperature=0,
        seed=0,
        model_name=model_name,
    )
    return completions


def build_content(
    context: str, input_text: str, ref_answer: str, pred_answer: str, role: str, prompt: str
) -> str:
    """Assemble the judge prompt: context, question, both answers, and the rubric."""
    return (
        f"[Context]\n{context}\n\n"
        f"[Question]\n{input_text}\n\n"
        f"[{role} 1]\n{ref_answer}\n\n[End of {role} 1]\n\n"
        f"[{role} 2]\n{pred_answer}\n\n[End of {role} 2]\n\n"
        f"[System]\n{prompt}\n\n"
        f"If it is not relevant to the context, does not answer directly, or says the wrong thing, give it a low score.\n\n"
    )
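
# Schematic of the assembled judge prompt (placeholders in angle brackets):
#
#   [Context]
#   <image context from the dataset>
#
#   [Question]
#   <user question>
#
#   [Assistant 1]
#   <reference answer>
#   [End of Assistant 1]
#
#   [Assistant 2]
#   <model answer>
#   [End of Assistant 2]
#
#   [System]
#   <category rubric from RULES, plus the low-score instruction above>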


class HeronBenchScorer:
    @staticmethod
    def score(refs: list[str], preds: list[str], **kwargs) -> list[dict[str, int]]:
        """Judge each prediction against its reference answer with an LLM judge."""
        docs = kwargs["docs"]
        client = kwargs["client"]
        judge_model = kwargs["judge_model"]

        contents = []
        for doc, ref, pred in zip(docs, refs, preds):
            content = build_content(
                doc["context"],
                doc["input_text"],
                ref,
                pred,
                "Assistant",
                RULES[doc["category"]]["prompt"],
            )
            contents.append(content)

        completions = ask_gpt4_batch(contents, 1024, client, judge_model)
        scores = [parse_score(completion) for completion in completions]
        assert len(scores) == len(docs)
        return scores

    @staticmethod
    def aggregate(scores: list[dict[str, int]], **kwargs) -> dict[str, float]:
        """Aggregate per-example scores into per-category and overall metrics."""
        docs = kwargs["docs"]
        category_list = ["conv", "detail", "complex"]
        heron_metrics = defaultdict(float)
        for category in category_list:
            score_owns = [
                score["score"]
                for score, doc in zip(scores, docs)
                if doc["category"] == category
            ]
            score_gpts = [
                score["score_gpt"]
                for score, doc in zip(scores, docs)
                if doc["category"] == category
            ]
            # Skip categories with no examples or where every judge response
            # failed to parse (all scores are the -1 sentinel).
            if len(score_owns) == 0 or np.mean(score_owns) == -1:
                continue
            avg_score = np.mean(score_owns)
            avg_score_rel = (
                100
                * np.mean(score_owns)
                / max(0.01, np.mean(score_gpts))  # guard against division by zero
            )
            heron_metrics[category] = avg_score
            heron_metrics[category + "_rel"] = avg_score_rel
        heron_metrics["parse_error_count"] = sum(
            score["score"] == -1 for score in scores
        )
        # Note: parse failures contribute their -1 sentinel to this average.
        heron_metrics["overall"] = sum(score["score"] for score in scores) / len(scores)
        heron_metrics["overall_rel"] = sum(
            heron_metrics[category + "_rel"] for category in category_list
        ) / len(category_list)
        return heron_metrics
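
# Each "<category>_rel" metric is the model's average score as a percentage of
# the GPT reference's average for that category, so 100 means parity with the
# reference answers. Categories skipped above contribute 0.0 to "overall_rel"
# via the defaultdict.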


def test_heron_bench_scorer():
    """Smoke test on a single Heron Bench example (requires a configured judge client)."""
    from datasets import load_dataset

    docs = load_dataset("Silviase/Japanese-Heron-Bench", split="train").select(range(1))
    docs = docs.rename_column("text", "input_text")
    pred_texts = ["This is a test."]
    client = OpenAIChatAPI()
    judge_model = "gpt-4o-mini-2024-07-18"
    refs = [doc["answer"]["gpt-4-0125-preview"] for doc in docs]
    scores = HeronBenchScorer.score(
        refs, pred_texts, docs=docs, client=client, judge_model=judge_model
    )
    assert len(scores) == 1
    print(scores)
    calculated_metric = HeronBenchScorer.aggregate(scores, docs=docs)
    assert "overall" in calculated_metric
    print(calculated_metric)


if __name__ == "__main__":
    from datasets import load_dataset

    ds = load_dataset("Silviase/Japanese-Heron-Bench", split="train")
    ds = ds.rename_column("text", "input_text")
    # Select the 101st and 102nd examples (indices 100-101).
    ds = ds.select(range(100, 102))
    pred_texts = [
        # "Judging from the image, the speed limit appears to be 50 km/h.
        # The road sign displays '50'."
        "画像から判断すると、制限速度は50 km/hのようです。道路標識に「50」と表示されています。",
        # "According to the sign in the image, it is 12 km from here to Niseko."
        "画像に写っている標識によると、現在地からニセコまでは12kmです。",
    ]
    refs = [doc["answer"]["gpt-4-0125-preview"] for doc in ds]
    client = OpenAIChatAPI()
    judge_model = "gpt-4o-2024-05-13"
    scores = HeronBenchScorer.score(
        refs, pred_texts, docs=ds, client=client, judge_model=judge_model
    )
    print(scores)