diff --git a/nemo_skills/dataset/aalcr/prepare.py b/nemo_skills/dataset/aalcr/prepare.py index 856d734639..ec9fa75256 100644 --- a/nemo_skills/dataset/aalcr/prepare.py +++ b/nemo_skills/dataset/aalcr/prepare.py @@ -191,6 +191,7 @@ def write_data_to_file(output_file, data, txt_file_folder, max_context_window, t entry[f"n_tokens_{tokenizer_name}"] = n_tokens entry["question"] = question + entry["original_question"] = question_text entry["expected_answer"] = entry.pop("answer") entry["expected_judgement"] = "correct" # for judgement metric # remove unused columns diff --git a/nemo_skills/evaluation/metrics/aalcr_metrics.py b/nemo_skills/evaluation/metrics/aalcr_metrics.py index c913554a37..56cc5d96e0 100644 --- a/nemo_skills/evaluation/metrics/aalcr_metrics.py +++ b/nemo_skills/evaluation/metrics/aalcr_metrics.py @@ -61,10 +61,9 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: # Primary evaluation method: LLM-based equality checker if "judgement" in prediction: - is_valid_generation = len(prediction["generation"].strip()) > 0 - correctness_dict["judge_correct"] = ( - self.is_aalcr_correct(prediction["judgement"]) if is_valid_generation else False - ) + # Invalid generation: reasoning is not finished or non-reasoning generation is empty + correctness_dict["generation_valid"] = len(prediction["generation"].strip()) > 0 + correctness_dict["judge_correct"] = self.is_aalcr_correct(prediction["judgement"]) if correctness_dict["generation_valid"] else False return correctness_dict diff --git a/nemo_skills/prompt/config/judge/aalcr.yaml b/nemo_skills/prompt/config/judge/aalcr.yaml index 0405968be3..fc2ba785bf 100644 --- a/nemo_skills/prompt/config/judge/aalcr.yaml +++ b/nemo_skills/prompt/config/judge/aalcr.yaml @@ -6,7 +6,7 @@ user: |- Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER. - The question, for reference only: {question} + The question, for reference only: {original_question} The OFFICIAL ANSWER: {expected_answer} CANDIDATE ANSWER TO ASSESS: {generation}