9
9
import spacy
10
10
from nltk .util import ngrams
11
11
from spacy .cli import download
12
-
12
+ import numpy as np
13
13
from lmms_eval .tasks ._task_utils .file_utils import generate_submission_file
14
14
15
15
# Download the English and Chinese models
@@ -231,6 +231,7 @@ def vcr_zh_process_results(doc, results):
231
231
for k in output .keys ():
232
232
output [k ].append (
233
233
{
234
+ "question_id" : doc ["question_id" ],
234
235
"score" : tmp [k ],
235
236
"pred_ngram" : tmp ["max_sim_string" ],
236
237
"gt_ngram" : crossed_text [i ],
@@ -240,26 +241,53 @@ def vcr_zh_process_results(doc, results):
240
241
return output
241
242
242
243
244
def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
    """Estimate the sampling variability of the mean of ``data`` by bootstrap.

    Args:
        data: a list (or 1-D array) of numeric values.
        n_bootstrap: number of bootstrap resamples to draw.
        ci: confidence level for the percentile interval (e.g. 0.95).

    Returns:
        A tuple ``(std, lower_bound, upper_bound)``: the standard deviation
        of the bootstrap means, and the lower/upper bounds of the central
        ``ci`` percentile interval of those means.
        (Note: the original docstring said "mean, lower bound, upper bound",
        but the first element returned is the std of the bootstrap means.)
    """
    n = len(data)
    # Draw every resample in one vectorized call: each row is one bootstrap
    # sample of size n, so `means` holds n_bootstrap bootstrap means.
    samples = np.random.choice(data, size=(n_bootstrap, n), replace=True)
    means = samples.mean(axis=1)
    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
    std = np.std(means)
    return std, lower_bound, upper_bound
263
+
264
+
243
265
def vcr_aggregate_results(results, args):
    """Aggregate per-document VCR scores and write a submission JSON file.

    Args:
        results: List[List[Dict]], list of per-document result lists returned
            by process_results; each inner dict carries a "score" entry.
        args: evaluation arguments forwarded to generate_submission_file.

    Returns:
        A float: the mean score over every blank of every document.
    """
    scores = []
    output_dict_detail_result = {}
    for i, doc_results in enumerate(results):
        for blank_result in doc_results:
            scores.append(blank_result["score"])
        # Keyed by document index as a string so the dict is JSON-serializable.
        output_dict_detail_result[str(i)] = doc_results
    mean_score = np.mean(scores)
    std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
    output_dict = {
        "mean_score": mean_score,
        "std_score": std,
        "percentile_2.5": lb,
        # Fixed key typo: was "percentie_97.5".
        "percentile_97.5": ub,
        "detailed_results": output_dict_detail_result,
    }
    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(output_dict, f, indent=4, ensure_ascii=False)
    # print(f"Submission file saved to {path}")
    eval_logger.info(f"Submission file saved to {path}")
    return mean_score
0 commit comments