NVIDIA-NeMo · hsiehjackson · Oct 24, 2025 · Oct 15, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/nemo_skills/dataset/aalcr/prepare.py b/nemo_skills/dataset/aalcr/prepare.py
@@ -191,6 +191,7 @@ def write_data_to_file(output_file, data, txt_file_folder, max_context_window, t
 
             entry[f"n_tokens_{tokenizer_name}"] = n_tokens
             entry["question"] = question
+            entry["original_question"] = question_text
             entry["expected_answer"] = entry.pop("answer")
             entry["expected_judgement"] = "correct"  # for judgement metric
             # remove unused columns

diff --git a/nemo_skills/evaluation/metrics/aalcr_metrics.py b/nemo_skills/evaluation/metrics/aalcr_metrics.py
@@ -61,10 +61,9 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
 
         # Primary evaluation method: LLM-based equality checker
         if "judgement" in prediction:
-            is_valid_generation = len(prediction["generation"].strip()) > 0
-            correctness_dict["judge_correct"] = (
-                self.is_aalcr_correct(prediction["judgement"]) if is_valid_generation else False
-            )
+            # Invalid generation: reasoning did not finish, or a non-reasoning generation is empty
+            correctness_dict["generation_valid"] = len(prediction["generation"].strip()) > 0
+            correctness_dict["judge_correct"] = self.is_aalcr_correct(prediction["judgement"]) if correctness_dict["generation_valid"] else False
 
         return correctness_dict
 

diff --git a/nemo_skills/prompt/config/judge/aalcr.yaml b/nemo_skills/prompt/config/judge/aalcr.yaml
@@ -6,7 +6,7 @@
 user: |-
     Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
 
-    The question, for reference only: {question}
+    The question, for reference only: {original_question}
     The OFFICIAL ANSWER: {expected_answer}
     CANDIDATE ANSWER TO ASSESS: {generation}