From dab9ba582ba1ece8db84c8b5d29dbbe4d926daf0 Mon Sep 17 00:00:00 2001
From: Stephen Ge <stepheng@nvidia.com>
Date: Mon, 15 Dec 2025 12:59:31 -0800
Subject: [PATCH 1/2] Fix prover.py compatibility with gpt-oss models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add _make_assistant_message helper to properly handle reasoning_content
from models like gpt-oss that output <|channel|> tags. The chat template
expects these in a 'thinking' field rather than embedded in 'content'.

This fix is backwards compatible - the 'thinking' field is only added
when reasoning_content is present in the generation output.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Stephen Ge <stepheng@nvidia.com>
---
 nemo_skills/inference/prover.py | 35 ++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/nemo_skills/inference/prover.py b/nemo_skills/inference/prover.py
index 848c7c70dd..bea5e306bc 100644
--- a/nemo_skills/inference/prover.py
+++ b/nemo_skills/inference/prover.py
@@ -195,6 +195,17 @@ def _transform_for_nemotron_refinement(self, proof_attempt: str, error_message:
             }
         )
 
+    def _make_assistant_message(self, content: str, reasoning_content: str | None = None) -> dict:
+        """Create an assistant message dict, optionally with thinking/reasoning content.
+
+        Some models (e.g., gpt-oss) output <|channel|> tags that need to be in a separate
+        'thinking' field rather than in 'content' for the chat template to work correctly.
+        """
+        message = {"role": "assistant", "content": content}
+        if reasoning_content:
+            message["thinking"] = reasoning_content
+        return message
+
     async def _single_data_point_generate(self, data_point, data):
         formal_statement = (
             (data_point["header"].strip() + "\n")
@@ -243,8 +254,11 @@ async def _single_data_point_generate(self, data_point, data):
                 ),
             )
 
+            # Get reasoning_content if available (e.g., from gpt-oss models)
+            reasoning_content = generation.get("reasoning_content")
+
             new_prompt_turn_list = deepcopy(prompt_turn_list)
-            new_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+            new_prompt_turn_list.append(self._make_assistant_message(generation["generation"], reasoning_content))
 
             prompt_turn_list_list.append(
                 new_prompt_turn_list
@@ -259,22 +273,15 @@ async def _single_data_point_generate(self, data_point, data):
             ):  # check if successfully parse the code. We do not want to delete the turn if there is a parsing error.
                 if self.cfg.delete_wrong_turns:
                     prompt_turn_list = deepcopy(base_prompt_turn_list) + [
-                        {
-                            "role": "assistant",
-                            "content": f"```lean4\n{full_code.strip()}\n```",
-                        }
+                        self._make_assistant_message(f"```lean4\n{full_code.strip()}\n```")
                     ]  # only keep the latest turn
                 else:
-                    prompt_turn_list += [
-                        {
-                            "role": "assistant",
-                            "content": f"```lean4\n{full_code.strip()}\n```",
-                        }
-                    ]
-                full_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+                    prompt_turn_list.append(self._make_assistant_message(f"```lean4\n{full_code.strip()}\n```"))
+                full_prompt_turn_list.append(self._make_assistant_message(generation["generation"], reasoning_content))
             else:
-                prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
-                full_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+                assistant_msg = self._make_assistant_message(generation["generation"], reasoning_content)
+                prompt_turn_list.append(assistant_msg)
+                full_prompt_turn_list.append(assistant_msg)
 
             if code == "None" or "**Error**" in full_code:
                 if code == "None":

From 3c25ee17af30ffe7387c42bee84d0126b2921dd6 Mon Sep 17 00:00:00 2001
From: Stephen Ge <stepheng@nvidia.com>
Date: Mon, 15 Dec 2025 13:52:28 -0800
Subject: [PATCH 2/2] Fix gpt-oss channel tag parsing in prover.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gpt-oss model outputs <|channel|> tags embedded in the content field,
which causes apply_chat_template to fail. The chat template expects
analysis/thinking content in a separate 'thinking' field.

Added _parse_gpt_oss_output() to extract:
- Analysis content (between <|channel|>analysis<|message|> and <|end|>)
  -> goes to 'thinking' field
- Final content (after <|channel|>final<|message|>)
  -> goes to 'content' field

Updated _make_assistant_message() to automatically parse gpt-oss format
when reasoning_content is not provided by the server.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Stephen Ge <stepheng@nvidia.com>
---
 nemo_skills/inference/prover.py | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/nemo_skills/inference/prover.py b/nemo_skills/inference/prover.py
index bea5e306bc..711b5fe9aa 100644
--- a/nemo_skills/inference/prover.py
+++ b/nemo_skills/inference/prover.py
@@ -195,12 +195,57 @@ def _transform_for_nemotron_refinement(self, proof_attempt: str, error_message:
             }
         )
 
+    def _parse_gpt_oss_output(self, content: str) -> tuple[str, str | None]:
+        """Parse gpt-oss model output to extract thinking and final content.
+
+        gpt-oss models output in the format:
+        <|channel|>analysis<|message|>...thinking...<|end|><|start|>assistant<|channel|>final<|message|>...final...<|return|>
+
+        The chat template expects analysis content in 'thinking' field and final content in 'content' field.
+
+        Returns:
+            tuple of (final_content, thinking_content or None)
+        """
+        import re
+
+        # Check if the content contains gpt-oss channel tags
+        if "<|channel|>" not in content:
+            return content, None
+
+        thinking = None
+        final_content = content
+
+        # Extract analysis/thinking content: between <|channel|>analysis<|message|> and <|end|>
+        analysis_pattern = r"<\|channel\|>analysis[^<]*<\|message\|>(.*?)<\|end\|>"
+        analysis_match = re.search(analysis_pattern, content, re.DOTALL)
+        if analysis_match:
+            thinking = analysis_match.group(1).strip()
+
+        # Extract final content: after <|channel|>final<|message|> until <|return|> or end
+        final_pattern = r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)"
+        final_match = re.search(final_pattern, content, re.DOTALL)
+        if final_match:
+            final_content = final_match.group(1).strip()
+        else:
+            # If no final channel found, try to strip all channel tags and use what remains
+            # This handles cases where the format might be slightly different
+            final_content = re.sub(r"<\|[^|]+\|>", "", content).strip()
+
+        return final_content, thinking
+
     def _make_assistant_message(self, content: str, reasoning_content: str | None = None) -> dict:
         """Create an assistant message dict, optionally with thinking/reasoning content.
 
         Some models (e.g., gpt-oss) output <|channel|> tags that need to be in a separate
         'thinking' field rather than in 'content' for the chat template to work correctly.
+
+        If reasoning_content is not provided, attempts to parse it from content if the content
+        contains gpt-oss channel tags.
         """
+        # If reasoning_content not provided, try to parse from content
+        if reasoning_content is None:
+            content, reasoning_content = self._parse_gpt_oss_output(content)
+
         message = {"role": "assistant", "content": content}
         if reasoning_content:
             message["thinking"] = reasoning_content