Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nemo_skills/dataset/aalcr/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def write_data_to_file(output_file, data, txt_file_folder, max_context_window, t

entry[f"n_tokens_{tokenizer_name}"] = n_tokens
entry["question"] = question
+ entry["original_question"] = question_text
entry["expected_answer"] = entry.pop("answer")
entry["expected_judgement"] = "correct" # for judgement metric
# remove unused columns
Expand Down
7 changes: 3 additions & 4 deletions nemo_skills/evaluation/metrics/aalcr_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,9 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:

# Primary evaluation method: LLM-based equality checker
if "judgement" in prediction:
- is_valid_generation = len(prediction["generation"].strip()) > 0
- correctness_dict["judge_correct"] = (
- self.is_aalcr_correct(prediction["judgement"]) if is_valid_generation else False
- )
+ # Invalid generation: reasoning is not finished or non-reasoning generation is empty
+ correctness_dict["generation_valid"] = len(prediction["generation"].strip()) > 0
+ correctness_dict["judge_correct"] = self.is_aalcr_correct(prediction["judgement"]) if correctness_dict["generation_valid"] else False

return correctness_dict

Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/prompt/config/judge/aalcr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
user: |-
Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.

- The question, for reference only: {question}
+ The question, for reference only: {original_question}
The OFFICIAL ANSWER: {expected_answer}
CANDIDATE ANSWER TO ASSESS: {generation}

Expand Down
Loading