diff --git a/nemo_skills/dataset/open-proof-corpus-judge/__init__.py b/nemo_skills/dataset/open-proof-corpus-judge/__init__.py
index 6cc4242cf2..881b046be2 100644
--- a/nemo_skills/dataset/open-proof-corpus-judge/__init__.py
+++ b/nemo_skills/dataset/open-proof-corpus-judge/__init__.py
@@ -15,3 +15,6 @@
 DATASET_GROUP = "math"
 METRICS_TYPE = "answer-judgement"
 GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement"
+
+# Judge-only dataset: it judges proofs directly, so no extra judge pipeline arguments are set
+JUDGE_PIPELINE_ARGS = {}
diff --git a/nemo_skills/dataset/proof-arena-judge/__init__.py b/nemo_skills/dataset/proof-arena-judge/__init__.py
index 6cc4242cf2..881b046be2 100644
--- a/nemo_skills/dataset/proof-arena-judge/__init__.py
+++ b/nemo_skills/dataset/proof-arena-judge/__init__.py
@@ -15,3 +15,6 @@
 DATASET_GROUP = "math"
 METRICS_TYPE = "answer-judgement"
 GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement"
+
+# Judge-only dataset: it judges proofs directly, so no extra judge pipeline arguments are set
+JUDGE_PIPELINE_ARGS = {}
diff --git a/nemo_skills/dataset/proof-bench-judge/__init__.py b/nemo_skills/dataset/proof-bench-judge/__init__.py
index 6cc4242cf2..881b046be2 100644
--- a/nemo_skills/dataset/proof-bench-judge/__init__.py
+++ b/nemo_skills/dataset/proof-bench-judge/__init__.py
@@ -15,3 +15,6 @@
 DATASET_GROUP = "math"
 METRICS_TYPE = "answer-judgement"
 GENERATION_ARGS = "++prompt_config=judge/math-proof-judge ++generation_key=judgement"
+
+# Judge-only dataset: it judges proofs directly, so no extra judge pipeline arguments are set
+JUDGE_PIPELINE_ARGS = {}
diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py
index fbe31ee425..36f69d9131 100644
--- a/tests/gpu-tests/test_eval.py
+++ b/tests/gpu-tests/test_eval.py
@@ -233,12 +233,14 @@ def test_prepare_and_eval_all_datasets():
         "ioi24",
         "ioi25",
         "bfcl_v3",
+        "bfcl_v4",
         "swe-bench",
         "aai",
         "human-eval",
         "human-eval-infilling",
         "mbpp",
         "mmau-pro",
+        "aalcr",  # Has tokenization mismatch issues
     }
 
     dataset_names = sorted(
@@ -252,7 +254,8 @@ def test_prepare_and_eval_all_datasets():
     judge_datasets = []
     for dataset in dataset_names:
         dataset_module = import_module(f"nemo_skills.dataset.{dataset}")
-        if getattr(dataset_module, "JUDGE_PIPELINE_ARGS", None):
+        # Use hasattr, not truthiness: a dataset may define JUDGE_PIPELINE_ARGS = {} (falsy)
+        if hasattr(dataset_module, "JUDGE_PIPELINE_ARGS"):
             judge_datasets.append(dataset)
 
     non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets]