Skip to content

Commit 326b969

Browse files
committed
include std and confidence interval
1 parent cd050d4 commit 326b969

File tree

1 file changed

+38
-10
lines changed

1 file changed

+38
-10
lines changed

Diff for: lmms_eval/tasks/vcr_wiki/utils.py

+38-10
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import spacy
1010
from nltk.util import ngrams
1111
from spacy.cli import download
12-
12+
import numpy as np
1313
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
1414

1515
# Download the English and Chinese models
@@ -231,6 +231,7 @@ def vcr_zh_process_results(doc, results):
231231
for k in output.keys():
232232
output[k].append(
233233
{
234+
"question_id": doc["question_id"],
234235
"score": tmp[k],
235236
"pred_ngram": tmp["max_sim_string"],
236237
"gt_ngram": crossed_text[i],
@@ -240,26 +241,53 @@ def vcr_zh_process_results(doc, results):
240241
return output
241242

242243

def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
    """Estimate the bootstrap standard error and confidence interval of the mean.

    Args:
        data: a list (or 1-D array) of numeric values
        n_bootstrap: number of bootstrap resamples to draw
        ci: confidence level for the interval (e.g. 0.95 for a 95% CI)

    Returns:
        A tuple ``(std, lower_bound, upper_bound)`` where ``std`` is the
        standard deviation of the bootstrap sample means (the bootstrap
        standard error of the mean) and the bounds delimit the central
        ``ci`` fraction of those means.
        (Fixed: the previous docstring incorrectly said the first element
        was the mean.)
    """
    n = len(data)
    # Draw all resamples in one vectorized call instead of a Python loop:
    # each row is one bootstrap sample of size n drawn with replacement.
    samples = np.random.choice(data, size=(n_bootstrap, n), replace=True)
    means = samples.mean(axis=1)
    # Central ci interval: e.g. ci=0.95 -> 2.5th and 97.5th percentiles.
    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
    std = np.std(means)
    return std, lower_bound, upper_bound
def vcr_aggregate_results(results, args):
    """Aggregate per-document blank scores into a mean with bootstrap uncertainty.

    Args:
        results: List[List[Dict]], list of results returned by process_results;
            each inner dict is expected to carry a numeric "score" entry.
        args: lmms-eval CLI args, forwarded to generate_submission_file to
            locate the submission output directory.

    Returns:
        A float, the mean score (jaccard index or exact match) over all blanks.

    Side effects:
        Writes a JSON submission file containing the mean score, bootstrap
        std, 95% CI percentile bounds, and the detailed per-document results.
    """
    scores = []
    output_dict_detail_result = {}
    for i in range(len(results)):
        for blank_id in range(len(results[i])):
            scores.append(results[i][blank_id]["score"])
        output_dict_detail_result[str(i)] = results[i]
    mean_score = np.mean(scores)
    std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
    output_dict = {
        "mean_score": mean_score,
        "std_score": std,
        "percentile_2.5": lb,
        # Fixed typo: this key was previously misspelled "percentie_97.5".
        "percentile_97.5": ub,
        "detailed_results": output_dict_detail_result,
    }
    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
    with open(path, "w", encoding="utf-8") as f:
        # np.float64 subclasses float, so the numpy scalars serialize fine.
        json.dump(output_dict, f, indent=4, ensure_ascii=False)
    # print(f"Submission file saved to {path}")
    eval_logger.info(f"Submission file saved to {path}")
    return mean_score

0 commit comments

Comments
 (0)