diff --git a/nemo_skills/dataset/open-proof-corpus-judge/__init__.py b/nemo_skills/dataset/open-proof-corpus-judge/__init__.py index 6cc4242cf2..881b046be2 100644 --- a/nemo_skills/dataset/open-proof-corpus-judge/__init__.py +++ b/nemo_skills/dataset/open-proof-corpus-judge/__init__.py @@ -15,3 +15,6 @@ DATASET_GROUP = "math" METRICS_TYPE = "answer-judgement" GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement" + +# This is a judge-only dataset (judges proofs directly) +JUDGE_PIPELINE_ARGS = {} diff --git a/nemo_skills/dataset/proof-arena-judge/__init__.py b/nemo_skills/dataset/proof-arena-judge/__init__.py index 6cc4242cf2..881b046be2 100644 --- a/nemo_skills/dataset/proof-arena-judge/__init__.py +++ b/nemo_skills/dataset/proof-arena-judge/__init__.py @@ -15,3 +15,6 @@ DATASET_GROUP = "math" METRICS_TYPE = "answer-judgement" GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement" + +# This is a judge-only dataset (judges proofs directly) +JUDGE_PIPELINE_ARGS = {} diff --git a/nemo_skills/dataset/proof-bench-judge/__init__.py b/nemo_skills/dataset/proof-bench-judge/__init__.py index 6cc4242cf2..881b046be2 100644 --- a/nemo_skills/dataset/proof-bench-judge/__init__.py +++ b/nemo_skills/dataset/proof-bench-judge/__init__.py @@ -15,3 +15,6 @@ DATASET_GROUP = "math" METRICS_TYPE = "answer-judgement" GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement" + +# This is a judge-only dataset (judges proofs directly) +JUDGE_PIPELINE_ARGS = {} diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index fbe31ee425..36f69d9131 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -233,12 +233,14 @@ def test_prepare_and_eval_all_datasets(): "ioi24", "ioi25", "bfcl_v3", + "bfcl_v4", "swe-bench", "aai", "human-eval", "human-eval-infilling", "mbpp", "mmau-pro", + "aalcr", # Has tokenization mismatch issues } dataset_names = sorted( @@ -252,7 +254,8 @@ def test_prepare_and_eval_all_datasets(): judge_datasets = [] for dataset in dataset_names: dataset_module = import_module(f"nemo_skills.dataset.{dataset}") - if getattr(dataset_module, "JUDGE_PIPELINE_ARGS", None): + # Check if JUDGE_PIPELINE_ARGS exists (even if empty dict, which is falsy) + if hasattr(dataset_module, "JUDGE_PIPELINE_ARGS"): judge_datasets.append(dataset) non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets]