diff --git a/nemo_skills/evaluation/metrics/utils.py b/nemo_skills/evaluation/metrics/utils.py index d23f483a55..f5f804d5b7 100644 --- a/nemo_skills/evaluation/metrics/utils.py +++ b/nemo_skills/evaluation/metrics/utils.py @@ -13,6 +13,7 @@ import json import logging +import re from typing import Union from nemo_skills.utils import get_logger_name @@ -34,8 +35,10 @@ def read_predictions(predictions, line_idx, file_handles): def is_correct_judgement(judgement, return_none=False) -> Union[bool, None]: - if "Judgement:" in judgement: - verdict = judgement.split("Judgement:")[-1].strip() + # Match both plain "Judgement:" and markdown bold "**Judgement**:" formats, this happens for gpt-4o which is AA Judge model. + match = re.search(r"\*{0,2}Judgement\*{0,2}\s*:", judgement, re.IGNORECASE) + if match: + verdict = judgement[match.end() :].strip() if verdict.lower().startswith("yes"): return True elif verdict.lower().startswith("no"):