diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 24d31a434..ceea8971a 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -67,8 +67,11 @@ def accuracy_reward(completions, solution, **kwargs): ], extraction_mode="first_match", ) - # Reward 1 if the content is the same as the ground truth, 0 otherwise - reward = float(verify(answer_parsed, gold_parsed)) + try: + # Reward 1 if the content is the same as the ground truth, 0 otherwise + reward = float(verify(answer_parsed, gold_parsed)) + except Exception as e: + reward = 0 else: # If the gold solution is not parseable, we reward 1 to skip this example reward = 1.0