@@ -370,6 +370,38 @@ class Metrics(Enum):
370
370
corpus_level_fn = np .mean ,
371
371
higher_is_better = True ,
372
372
)
373
# Math pass@k metric: PassAtK configured with k=1 and n=1 (reported under the
# name "math_pass@1:1_samples"), averaged over the corpus with np.mean.
math_pass_at_1_1n = SampleLevelMetric(
    metric_name="math_pass@1:1_samples",
    sample_level_fn=PassAtK(
        k=1,
        n=1,
        strip_strings=True,
        # Gold normalization: extract mathematical / LaTeX expressions.
        # NOTE: the lambda's `k` is the value being normalized; it is
        # unrelated to the `k=1` keyword passed to PassAtK above.
        normalize_gold=lambda k: extract_target_from_pred(
            k,
            get_extraction_regexes(
                formatted_doc=None,
                target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
                language=Language.ENGLISH,
            ),
        ),
        # Prediction normalization: same extraction setup as for the gold.
        normalize_pred=lambda k: extract_target_from_pred(
            k,
            get_extraction_regexes(
                formatted_doc=None,
                target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
                language=Language.ENGLISH,
            ),
        ),
        # Scoring is delegated to compare_gold_target (a sympy-based
        # comparison, per the original author's note).
        sample_scoring_function=compare_gold_target,
    ).compute,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    corpus_level_fn=np.mean,
    higher_is_better=True,
)
373
405
math_pass_at_1_4n = SampleLevelMetric (
374
406
metric_name = "math_pass@1:4_samples" ,
375
407
sample_level_fn = PassAtK (
@@ -838,6 +870,57 @@ class Metrics(Enum):
838
870
pred_extraction_target = [IndicesExtractionConfig (prefix_for_extraction = "NativeLetters" )],
839
871
precision = 6 ,
840
872
)
873
# GPQA pass@k metric: PassAtK configured with k=1 and n=1 (reported under the
# name "gpqa_pass@1:1_samples"), averaged over the corpus with np.mean.
gpqa_instruct_pass_at_1_1n = SampleLevelMetric(
    metric_name="gpqa_pass@1:1_samples",
    sample_level_fn=PassAtK(
        k=1,
        n=1,
        # Each call builds an English extractive-match metric that uses the
        # "NativeLetters" index extraction for both gold and prediction, then
        # scores the pair; `ref` and `pred` are wrapped in one-element lists
        # before being handed to its sample_level_fn.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            language=Language.ENGLISH,
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            precision=6,
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    corpus_level_fn=np.mean,
    higher_is_better=True,
)
890
# GPQA pass@k metric: PassAtK configured with k=1 and n=4 (reported under the
# name "gpqa_pass@1:4_samples"), averaged over the corpus with np.mean.
gpqa_instruct_pass_at_1_4n = SampleLevelMetric(
    metric_name="gpqa_pass@1:4_samples",
    sample_level_fn=PassAtK(
        k=1,
        n=4,
        # Each call builds an English extractive-match metric that uses the
        # "NativeLetters" index extraction for both gold and prediction, then
        # scores the pair; `ref` and `pred` are wrapped in one-element lists
        # before being handed to its sample_level_fn.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            language=Language.ENGLISH,
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            precision=6,
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    corpus_level_fn=np.mean,
    higher_is_better=True,
)
907
# GPQA pass@k metric: PassAtK configured with k=1 and n=8 (reported under the
# name "gpqa_pass@1:8_samples"), averaged over the corpus with np.mean.
gpqa_instruct_pass_at_1_8n = SampleLevelMetric(
    metric_name="gpqa_pass@1:8_samples",
    sample_level_fn=PassAtK(
        k=1,
        n=8,
        # Each call builds an English extractive-match metric that uses the
        # "NativeLetters" index extraction for both gold and prediction, then
        # scores the pair; `ref` and `pred` are wrapped in one-element lists
        # before being handed to its sample_level_fn.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            language=Language.ENGLISH,
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            precision=6,
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    corpus_level_fn=np.mean,
    higher_is_better=True,
)
841
924
842
925
def __str__(self):
    """Return this member's name with every "_at_" replaced by "@"."""
    member_name = self.name
    return member_name.replace("_at_", "@")
0 commit comments