diff --git a/nemo_skills/code_execution/proof_utils.py b/nemo_skills/code_execution/proof_utils.py
index fdf84a87cf..33c92891db 100644
--- a/nemo_skills/code_execution/proof_utils.py
+++ b/nemo_skills/code_execution/proof_utils.py
@@ -14,12 +14,16 @@
 
 """Shared utilities for proof processing and evaluation."""
 
+import logging
 import re
 from dataclasses import dataclass
 from typing import Any, Dict
 
 from nemo_skills.code_execution.utils import clean_formal_generation
 from nemo_skills.dataset.utils import get_lean4_header
+from nemo_skills.utils import get_logger_name
+
+LOG = logging.getLogger(get_logger_name(__file__))
 
 
 @dataclass
@@ -192,3 +196,183 @@ def prepare_predicted_proof_from_line_dict(
     return build_lean4_proof(
         generation=line_dict["generation"], data_point=line_dict, config=config, answer_format=answer_format
     )
+
+
+# ------------------------------------------------------------------------------------------------
+# The following code is adapted from https://github.com/Goedel-LM/Goedel-Prover-V2
+# Used for multi-turn proof refinement in the lean4_prover workflow
+# ------------------------------------------------------------------------------------------------
+
+
+def remove_comments(text):
+    # First remove all /- ... -/ blocks
+    text = re.sub(r"/-.*?-/", "", text, flags=re.DOTALL)
+    # Then remove -- comments from each line
+    lines = text.split("\n")
+    cleaned_lines = []
+    for line in lines:
+        cleaned_line = line.split("--", 1)[0]
+        if cleaned_line.strip() == "":
+            continue
+        cleaned_lines.append(cleaned_line)
+    # Join back together and remove excessive empty lines
+    cleaned_text = "\n".join(cleaned_lines)
+    return cleaned_text.strip()
+
+
+def move_imports_to_beginning(input_string):
+    lines = input_string.split("\n")
+    import_lines = [line for line in lines if line.startswith("import")]
+    other_lines = [line for line in lines if not line.startswith("import")]
+    return "\n".join(import_lines + other_lines)
+
+
+def return_theorem_to_prove(text):
+    # Pattern that matches from 'theorem' or 'lemma' to ':= by sorry' with any content in between
+    pattern = r"((?:theorem).*?:=\s*by\s*sorry)"
+    match = re.search(pattern, text, re.DOTALL)
+    return match.span() if match else None
+
+
+def return_theorem_to_replace(text):
+    # Pattern that matches from 'theorem' or 'lemma' to ':= by sorry' with any content in between
+    pattern = r"((?:^|\s)theorem\s+.*?:=\s*by)"
+    match = re.search(pattern, text, re.DOTALL)
+    return match.span() if match else None
+
+
+def replace_statement_in_proof(statement, proof):
+    if ("apply?" in proof) or ("exact?" in proof):
+        return "**Error**, 'apply?' or 'exact?' is used, which is not allowed."
+    stats_re = remove_comments(statement)
+    stats_span_ = return_theorem_to_prove(stats_re)
+    if stats_span_ is None:
+        error_app = "\n".join(["\n"] + ["-- " + x for x in statement.split("\n")])
+        return f"**Error**, can not find 'theorem' and ':= sorry' in {error_app}"
+    proof_str = remove_comments(proof)
+    span = return_theorem_to_replace(proof_str)
+    if span is None:
+        error_app = "\n".join(["\n"] + ["-- " + x for x in proof.split("\n")])
+        return f"**Error**, can not find 'theorem' and ':=' in {error_app}"
+    return stats_re[: stats_span_[1]].replace("sorry", "") + proof_str[span[1] :]
+
+
+def refine_by_sorry(text):
+    # Define the regular expression pattern
+    target_pattern = r":=\s*(?:by\s*)?(?:sorry\s*)?"
+    replacement = ":= by sorry"  # The new text we want to insert
+    # We construct the pattern with two capturing groups
+    # (group 1: the part from 'theorem' to just before our target)
+    # (group 2: the target pattern itself)
+    combined_pattern = r"(theorem.*?)(" + target_pattern + r")"
+    # Find the first match
+    match = re.search(combined_pattern, text, re.DOTALL)
+    if match:
+        # The part of the string BEFORE the target we want to replace
+        # We use match.start(2) which is the start of the second group (our target)
+        prefix = text[: match.start(2)]
+        # Concatenate the prefix with the replacement to get the final, truncated string
+        final_text = prefix + replacement
+    else:
+        final_text = text
+    return final_text
+
+
+def extract_code(inputs):
+    import_head = (
+        "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n"
+    )
+    pattern = r"```lean4\n(.*?)\n```"
+    matches = re.findall(pattern, inputs, re.DOTALL)
+    if matches:
+        return import_head + matches[-1]
+    pattern = r"```lean4\n(.*?)```"
+    matches = re.findall(pattern, inputs, re.DOTALL)
+    if matches:
+        return import_head + matches[-1]
+    pattern = r"```lean\n(.*?)```"
+    matches = re.findall(pattern, inputs, re.DOTALL)
+    if matches:
+        return import_head + matches[-1]
+    return "None"
+
+
+def parse_error(log_string):
+    """Parse Lean4 compiler error messages from log output."""
+    error_pattern = re.compile(
+        r"(/lean4/my_project/.*?:\d+:\d+: error:.*?)(?=\n/lean4/my_project|\Z)",
+        re.DOTALL,
+    )
+    errors = error_pattern.findall(log_string)
+    pattern = re.compile(r":(\d+):(\d+):")
+    error_list = []
+    for error in errors:
+        match = pattern.search(error)
+        error_list.append(
+            {
+                "pos": {"line": int(match.group(1)), "column": int(match.group(2))},
+                "endPos": None,
+                "data": error.split("error:")[1],
+            }
+        )
+
+    return error_list
+
+
+def get_error_str(code, errors, error_thres=True):
+    """Format compiler errors with code context for display."""
+    err_str = ""
+    code_lines = code.split("\n")
+    error_num_thres = 8 if error_thres else len(errors)
+
+    for i, error in enumerate(errors[:error_num_thres]):
+        start_line = error["pos"]["line"] - 1
+        start_col = error["pos"]["column"]
+        if start_line >= len(code_lines):
+            LOG.warning(
+                "Error line %d exceeds code length %d. Errors: %s, Code: %s", start_line, len(code_lines), errors, code
+            )
+            continue
+        if error["endPos"] is None:
+            end_line = start_line
+            end_col = len(code_lines[start_line])
+        else:
+            end_line = error["endPos"]["line"] - 1
+            end_col = error["endPos"]["column"]
+
+        err_str += f"\nError {i + 1}:\n"
+        err_str += "\nCorresponding Code:\n```lean4\n"
+        error_code = ""
+        for ii in range(-4, 0):
+            if start_line + ii >= 0:
+                error_code += f"{code_lines[start_line + ii]}\n"
+        if start_line != end_line:
+            error_code += code_lines[start_line][:start_col] + "<error>" + code_lines[start_line][start_col:] + "\n"
+            if not error_thres:
+                for j in range(start_line + 1, end_line):
+                    error_code += f"{code_lines[j]}\n"
+            else:
+                show_line = 6
+                for j in range(start_line + 1, min(end_line, start_line + show_line)):
+                    error_code += f"{code_lines[j]}\n"
+                if end_line > start_line + show_line:
+                    leading_spaces = len(code_lines[j]) - len(code_lines[j].lstrip(" "))
+                    error_code += "\n" + " " * leading_spaces + "... --[Truncated]-- ...\n"
+            error_code += code_lines[end_line][:end_col] + "</error>" + code_lines[end_line][end_col:] + "\n"
+        else:
+            error_code += (
+                code_lines[start_line][:start_col]
+                + "<error>"
+                + code_lines[start_line][start_col:end_col]
+                + "</error>"
+                + code_lines[start_line][end_col:]
+                + "\n"
+            )
+        if end_line + 1 < len(code_lines):
+            error_code += f"{code_lines[end_line + 1]}\n"
+        err_str += error_code
+        err_str += "\n```\n"
+        err_str += f"\nError Message: {error['data']}\n"
+    if len(errors) > error_num_thres:
+        err_str += f"\n... [Omitted {len(errors) - error_num_thres} more errors] ...\n"
+    return err_str
diff --git a/nemo_skills/inference/prover.py b/nemo_skills/inference/prover.py
new file mode 100644
index 0000000000..848c7c70dd
--- /dev/null
+++ b/nemo_skills/inference/prover.py
@@ -0,0 +1,426 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import re
+import sys
+from copy import deepcopy
+from dataclasses import asdict, is_dataclass
+
+import hydra
+from transformers import AutoTokenizer
+
+from nemo_skills.code_execution.proof_utils import (
+    extract_code,
+    get_error_str,
+    parse_error,
+    refine_by_sorry,
+    replace_statement_in_proof,
+)
+from nemo_skills.code_execution.sandbox import sandbox_params
+from nemo_skills.inference.model import server_params
+from nemo_skills.inference.model.base import EndpointType
+from nemo_skills.prompt.utils import get_prompt
+from nemo_skills.utils import (
+    get_help_message,
+    get_logger_name,
+    nested_dataclass,
+    parse_reasoning,
+    setup_logging,
+)
+
+from .generate import GenerateSolutionsConfig, GenerationTask
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+reasoning_effort_list = [
+    "low",
+    "medium",
+    "high",
+]  # This is only used for adaptive reasoning with gpt-oss models
+
+
+@nested_dataclass(kw_only=True)
+class ProverConfig(GenerateSolutionsConfig):
+    max_tokens: int = 40960  # model max tokens
+    n_pass: int = 1  # number of passes to run the prover
+
+    # Lean 4 specific parameters
+    nemotron_refinement: bool = False  # whether to use single-turn nemotron-style refinement
+    refinement: bool = False  # whether to refine the code
+    refinement_max_turns: int = 2  # maximum number of turns for refinement
+    refinement_prompt_config: str | None = None  # prompt for multi-turn refinement feedback
+    # prompt for single-turn nemotron refinement (used when nemotron_refinement=True)
+    nemotron_refinement_prompt_config: str | None = None
+    adaptive_reasoning: bool = False  # whether to adapt the reasoning effort
+    parse_generation: bool = False  # whether to parse the generation
+    remove_cot: bool = False  # whether to remove the cot from the generation
+    # whether to delete the wrong turns from the generation
+    delete_wrong_turns: bool = False
+
+    def _post_init_validate_params(self):
+        """Validate that certain parameters are restricted to certain values"""
+        if self.prompt_format == "openai":
+            raise ValueError(
+                "prompt_format='openai' is not supported for lean4_prover. Use prompt_format='ns' with a prompt_config."
+            )
+        if self.prompt_format != "ns":
+            raise ValueError(f"prompt_format must be 'ns', got '{self.prompt_format}'")
+
+        assert self.prompt_config is not None, "prompt_config is required for lean4_prover"
+
+        for param, default_value in self._get_disallowed_params():
+            if getattr(self, param) != default_value:
+                raise ValueError(f"{param} must be {default_value}")
+
+        if self.n_pass > 32:
+            LOG.warning(
+                "n_pass=%d exceeds recommended maximum of 32. Consider using num_random_seeds instead.", self.n_pass
+            )
+
+
+cs = hydra.core.config_store.ConfigStore.instance()
+cs.store(name="base_prover_config", node=ProverConfig)
+
+
+class ProverTask(GenerationTask):
+    def __init__(self, cfg: ProverConfig):
+        """
+        Class that represents a generation task. It implements a template of steps to generate solutions using LLMs.
+        Individual functions can be overriden to customize the behavior of the generation task.
+
+        Args:
+            cfg: GenerateSolutionsConfig object with the configuration parameters or subclass.
+        """
+        super().__init__(cfg)
+
+        # Initialize tokenizer for chat template application
+        tokenizer_path = self.cfg.tokenizer or self.cfg.server.get("model")
+        self.hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+
+        if self.cfg.refinement:
+            self.setup_refine_prompt()
+
+        if self.cfg.delete_wrong_turns:
+            assert self.cfg.remove_cot, "remove_cot is required when delete_wrong_turns is enabled"
+
+    def log_example_prompt(self, data):
+        return
+
+    def setup_llm(self):
+        if self.cfg.code_execution:
+            raise ValueError("Code execution is not supported for prover. Use sandbox config for Lean4 execution.")
+        return super().setup_llm()
+
+    def setup_refine_prompt(self):
+        assert self.cfg.refinement_prompt_config is not None, (
+            "refinement_prompt_config is required when refinement is enabled. Please set refinement=False to disable refinement."
+        )
+        self.refine_prompt = get_prompt(self.cfg.refinement_prompt_config)
+
+        if self.cfg.nemotron_refinement:
+            assert self.cfg.nemotron_refinement_prompt_config is not None, (
+                "nemotron_refinement_prompt_config is required when nemotron_refinement is enabled."
+            )
+            self.nemotron_refine_prompt = get_prompt(self.cfg.nemotron_refinement_prompt_config)
+
+    async def _generate_single_completion(self, prompt: str, **kwargs):
+        """Generate a single completion with semaphore-controlled concurrency."""
+        if is_dataclass(self.cfg.inference):
+            inference_params = asdict(self.cfg.inference)
+        else:
+            # Already a dict from Hydra
+            inference_params = dict(self.cfg.inference)
+        generation_params = {
+            "prompt": prompt,
+            "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
+            **inference_params,
+            **self.extra_generate_params,
+        }
+        # Override endpoint_type to text since we already applied the chat template
+        generation_params["endpoint_type"] = EndpointType.text
+        for key, value in kwargs.items():
+            generation_params[key] = value
+
+        # Use semaphore for concurrency control (inherited from GenerationTask)
+        async with self.semaphore:
+            generation = await self.llm.generate_async(**generation_params)
+            if self.cfg.adaptive_reasoning:
+                assert generation_params["extra_body"].get("reasoning_effort", None) is not None, (
+                    "reasoning_effort is required when adaptive_reasoning is enabled"
+                )
+                reasoning_effort_index = reasoning_effort_list.index(
+                    generation_params["extra_body"].get("reasoning_effort", None)
+                )
+                while len(generation["generation"]) == 0 and reasoning_effort_index > 0:
+                    LOG.info(
+                        "Reasoning effort is too high, reducing to %s",
+                        reasoning_effort_list[reasoning_effort_index - 1],
+                    )
+                    reasoning_effort_index = reasoning_effort_index - 1
+                    generation_params["extra_body"]["reasoning_effort"] = reasoning_effort_list[reasoning_effort_index]
+                    generation = await self.llm.generate_async(**generation_params)
+
+        if self.cfg.parse_generation:
+            parse_reasoning(
+                generation,
+                self.cfg.generation_key,
+                self.cfg.end_reasoning_string,
+            )
+        return generation
+
+    # factor out this part so it won't become a bottleneck.
+    async def _extract_and_replace_code(self, formal_statement, generation):
+        code = extract_code(generation)
+        full_code = replace_statement_in_proof(formal_statement, code)
+        return code, full_code
+
+    def _transform_for_nemotron_refinement(self, proof_attempt: str, error_message: str) -> list[dict]:
+        """Transform multi-turn refinement into single-turn nemotron-style prompt."""
+        return self.nemotron_refine_prompt.fill(
+            {
+                "proof_attempt": proof_attempt,
+                "error_message": error_message,
+            }
+        )
+
+    async def _single_data_point_generate(self, data_point, data):
+        formal_statement = (
+            (data_point["header"].strip() + "\n")
+            + data_point["informal_prefix"].strip()
+            + ("\n" + data_point["formal_statement"].strip())
+        )
+        formal_statement = refine_by_sorry(formal_statement)
+        prompt_turn_list = self.prompt.fill({"problem": formal_statement.strip()})
+
+        full_prompt_turn_list = deepcopy(
+            prompt_turn_list
+        )  # We need to get a full copy of the prompt turn list for the final result in case remove_cot is enabled. This is only used to generate SFT data.
+        prompt_turn_list_list = []  # We need to store the prompt turn list for each turn for the final result in case delete_wrong_turns is enabled. This is only used to generate SFT data.
+        base_prompt_turn_list = deepcopy(prompt_turn_list)
+
+        code_list = []
+        results_dict_list = []
+        assert isinstance(prompt_turn_list, list), "prompt_turn_list should be a list"
+
+        success = False
+        turn_idx = 0
+        last_proof_attempt = None  # Track for nemotron refinement
+        last_error_message = None  # Track for nemotron refinement
+        for turn_idx in range(self.cfg.refinement_max_turns):
+            results_dict = {}  # everything will be stored in this dict
+            if turn_idx != 0 and self.cfg.nemotron_refinement and last_proof_attempt and last_error_message:
+                prepared_conversation = self._transform_for_nemotron_refinement(last_proof_attempt, last_error_message)
+            else:
+                prepared_conversation = prompt_turn_list
+            prefix_tokens = self.hf_tokenizer.apply_chat_template(
+                prepared_conversation, tokenize=True, add_generation_prompt=True
+            )
+            num_tokens_prefix = len(prefix_tokens)
+            prefix = self.hf_tokenizer.apply_chat_template(
+                prepared_conversation, tokenize=False, add_generation_prompt=True
+            )
+            # We need to check if the prefix is too long, if it is, we need to break the loop
+            if num_tokens_prefix > self.cfg.max_tokens:
+                break
+
+            generation = await self._generate_single_completion(
+                prefix,
+                tokens_to_generate=min(
+                    self.cfg.max_tokens - num_tokens_prefix,
+                    self.cfg.inference.tokens_to_generate,
+                ),
+            )
+
+            new_prompt_turn_list = deepcopy(prompt_turn_list)
+            new_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+
+            prompt_turn_list_list.append(
+                new_prompt_turn_list
+            )  # This stores the latest turn list after each generation.
+
+            code, full_code = await self._extract_and_replace_code(formal_statement, generation["generation"])
+            last_proof_attempt = generation["generation"]  # Track for nemotron refinement
+            code_list.append(full_code)
+            results_dict["code"] = code  # We keep track of the uncleaned code.
+            if self.cfg.remove_cot and not (
+                code == "None" or "**Error**" in full_code
+            ):  # check if successfully parse the code. We do not want to delete the turn if there is a parsing error.
+                if self.cfg.delete_wrong_turns:
+                    prompt_turn_list = deepcopy(base_prompt_turn_list) + [
+                        {
+                            "role": "assistant",
+                            "content": f"```lean4\n{full_code.strip()}\n```",
+                        }
+                    ]  # only keep the latest turn
+                else:
+                    prompt_turn_list += [
+                        {
+                            "role": "assistant",
+                            "content": f"```lean4\n{full_code.strip()}\n```",
+                        }
+                    ]
+                full_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+            else:
+                prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+                full_prompt_turn_list += [{"role": "assistant", "content": generation["generation"]}]
+
+            if code == "None" or "**Error**" in full_code:
+                if code == "None":
+                    execution_result = {
+                        "process_status": "failed",
+                        "stderr": "",
+                        "stdout": "Parsing error. Cannot parse the code from output. Please try again and write the code in the format of ```lean4\n<code>\n```",
+                    }
+                elif "**Error**" in full_code:
+                    execution_result = {
+                        "process_status": "failed",
+                        "stderr": "",
+                        "stdout": full_code,
+                    }
+                else:
+                    execution_result = {
+                        "process_status": "failed",
+                        "stderr": "",
+                        "stdout": "Unknown error when parsing code.",
+                    }
+                results_dict["execution_result"] = execution_result
+                results_dict["success"] = False
+                last_error_message = execution_result["stdout"]  # Track for nemotron refinement
+                feedback = self.refine_prompt.fill({"error_message": last_error_message})
+                results_dict["feedback"] = feedback[0]["content"]
+            else:
+                if self.sandbox is None:
+                    raise RuntimeError(
+                        "Sandbox is required for Lean4 code execution but was not configured. "
+                        "Please provide sandbox configuration."
+                    )
+                # execute_code returns (result_dict, session_id) tuple
+                execution_result, _ = await self.sandbox.execute_code(
+                    full_code, language="lean4", timeout=600.0, max_output_characters=1000000
+                )
+                results_dict["execution_result"] = execution_result
+                # Handle timeout (now indicated by process_status in the dict)
+                if execution_result.get("process_status") == "timeout":
+                    results_dict["success"] = False
+                    last_error_message = (
+                        "The compilation timed out. There might be a heavy computation in the code or an endless loop."
+                    )
+                    feedback = self.refine_prompt.fill({"error_message": last_error_message})
+                    results_dict["feedback"] = feedback[0]["content"]
+                elif (
+                    execution_result["process_status"] == "completed"
+                    and "sorry" not in execution_result["stdout"]
+                    and "failed" not in execution_result["stdout"]
+                ):
+                    results_dict["success"] = True
+                else:
+                    error_list = parse_error(execution_result["stdout"])
+                    error_message = get_error_str(full_code, error_list, error_thres=True)
+                    # checking for sorry
+                    if execution_result["process_status"] == "completed":
+                        stdout = execution_result["stdout"].lower()
+                        stderr = execution_result["stderr"].lower()
+                        combined = stdout + "\n" + stderr
+                        if re.search(r"\bsorry\b", combined) is not None:
+                            error_message += "\nThe code contains 'sorry', which means the proof is incomplete."
+                    if error_message.strip() == "":  # something in stderr indicating failure
+                        error_message = execution_result["stderr"][:1000]
+                        if len(execution_result["stderr"]) > 1000:
+                            error_message += "... (truncated)"
+
+                    last_error_message = (
+                        "We use <error></error> to signal the position of the error. \n" + error_message
+                    )
+                    feedback = self.refine_prompt.fill({"error_message": last_error_message})
+                    results_dict["feedback"] = feedback[0]["content"]
+                    results_dict["success"] = False
+
+            results_dict_list.append(results_dict)
+
+            if results_dict["success"]:
+                # This is the case when the code execution is successful. The theorem is proved.
+                break
+            else:
+                if self.cfg.refinement and turn_idx < self.cfg.refinement_max_turns - 1:
+                    prompt_turn_list += feedback
+                    full_prompt_turn_list += feedback
+                else:
+                    # Proving attempt failed.
+                    break
+
+        if len(results_dict_list) > 0 and results_dict_list[-1]["success"]:
+            success = True
+
+        # Usually only need prompt_turn_list for standard SFT, full_prompt_turn_list for SFT with remove_cot enabled, prompt_turn_list_list for SFT with delete_wrong_turns enabled.
+        return {
+            "code_list": code_list,
+            "results_dict_list": results_dict_list,
+            "prompt_turn_list": prompt_turn_list,
+            "turn_idx": turn_idx,
+            "success": success,
+            "full_prompt_turn_list": full_prompt_turn_list,
+            "prompt_turn_list_list": prompt_turn_list_list,
+        }
+
+    async def pass_at_N(self, data_point, data, N=None):
+        if N is None:
+            N = self.cfg.n_pass
+
+        new_results_dict = {"success": False}
+        for i in range(N):
+            results_dict = await self._single_data_point_generate(data_point, data)
+
+            if results_dict["success"]:
+                new_results_dict["success"] = True
+                break
+
+        new_results_dict["results_dict_list"] = results_dict
+        new_results_dict["n_pass"] = i + 1
+
+        return new_results_dict
+
+    async def process_single_datapoint(self, data_point, all_data):
+        result = await self.pass_at_N(data_point, all_data)
+        result_dict = {"generation": result}
+
+        return result_dict
+
+
+GENERATION_TASK_CLASS = ProverTask
+
+
+# Update the hydra main to use the class method
+@hydra.main(version_base=None, config_name="base_prover_config")
+def generate(cfg: ProverConfig):
+    cfg = ProverConfig(_init_nested=True, **cfg)
+    LOG.info("Config used: %s", cfg)
+
+    task = ProverTask(cfg)
+    task.generate()
+
+
+HELP_MESSAGE = get_help_message(
+    ProverConfig,
+    server_params=server_params(),
+    sandbox_params=sandbox_params(),
+)
+
+
+if __name__ == "__main__":
+    if "--help" in sys.argv or "-h" in sys.argv:
+        print(HELP_MESSAGE)
+    else:
+        setup_logging()
+        generate()
diff --git a/nemo_skills/prompt/config/lean4/formal-proof-deepseek-prover-v2-nemotron.yaml b/nemo_skills/prompt/config/lean4/formal-proof-deepseek-prover-v2-nemotron.yaml
new file mode 100644
index 0000000000..54da2461e8
--- /dev/null
+++ b/nemo_skills/prompt/config/lean4/formal-proof-deepseek-prover-v2-nemotron.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration for proving formal theorems in Lean 4.
+# This file is tailored for tasks that involve constructing and verifying proofs
+# of theorems within the Lean 4 formal system.
+
+user: |-
+  Complete the following Lean 4 code:
+
+  ```lean4
+  {header}{informal_prefix}{formal_statement}
+    sorry
+  ```
+
+  First, think through your solution step-by-step. Provide a detailed proof plan outlining the main proof steps and strategies. The plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof.
+
+  Then provide your final answer. Your final answer must be a single, complete Lean 4 markdown code block containing the completed theorem. Do NOT include any text or explanation before or after the code block. Begin with ```lean4 and end with ```.
diff --git a/nemo_skills/prompt/config/lean4/goedel-prover-v2-nemotron.yaml b/nemo_skills/prompt/config/lean4/goedel-prover-v2-nemotron.yaml
new file mode 100644
index 0000000000..db6adac3a5
--- /dev/null
+++ b/nemo_skills/prompt/config/lean4/goedel-prover-v2-nemotron.yaml
@@ -0,0 +1,28 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration for proving formal theorems in Lean 4.
+# This file is tailored for tasks that involve constructing and verifying proofs
+# of theorems within the Lean 4 formal system.
+
+user: |-
+  Complete the following Lean 4 code:
+
+  ```lean4
+  {problem}
+  ```
+
+  First, think through your solution step-by-step. Provide a detailed proof plan outlining the main proof steps and strategies. The plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof.
+
+  Then provide your final answer. Your final answer must be a single, complete Lean 4 markdown code block containing the completed theorem. Do NOT include any text or explanation before or after the code block. Begin with ```lean4 and end with ```.
diff --git a/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement-nemotron.yaml b/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement-nemotron.yaml
new file mode 100644
index 0000000000..960e4dbc8a
--- /dev/null
+++ b/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement-nemotron.yaml
@@ -0,0 +1,28 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration for proving formal theorems in Lean 4.
+# This file is tailored for tasks that involve constructing and verifying proofs
+# of theorems within the Lean 4 formal system.
+
+user: |-
+  Here is a proof attempt for the following theorem in Lean4.
+
+  {proof_attempt}
+
+  The proof is not correct. Following is the compilation error message:
+
+  {error_message}
+
+  Your task is to fix this proof. Before producing the Lean 4 code to formally prove the given theorem, do a detailed analysis of the error message. Your final answer must be a single, complete Lean 4 markdown code block containing the completed theorem. Do NOT include any text or explanation before or after the code block. Begin with ```lean4 and end with ```.
diff --git a/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement.yaml b/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement.yaml
new file mode 100644
index 0000000000..b46fa0a3cc
--- /dev/null
+++ b/nemo_skills/prompt/config/lean4/goedel-prover-v2-refinement.yaml
@@ -0,0 +1,24 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration for proving formal theorems in Lean 4.
+# This file is tailored for tasks that involve constructing and verifying proofs
+# of theorems within the Lean 4 formal system.
+
+user: |-
+  The proof is not correct. Following is the compilation error message:
+
+  {error_message}
+
+  Before producing the Lean 4 code to formally prove the given theorem, provide a detailed analysis of the error message.
diff --git a/nemo_skills/prompt/config/lean4/goedel-prover-v2.yaml b/nemo_skills/prompt/config/lean4/goedel-prover-v2.yaml
new file mode 100644
index 0000000000..86d18a33f1
--- /dev/null
+++ b/nemo_skills/prompt/config/lean4/goedel-prover-v2.yaml
@@ -0,0 +1,27 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration for proving formal theorems in Lean 4.
+# This file is tailored for tasks that involve constructing and verifying proofs
+# of theorems within the Lean 4 formal system.
+
+user: |-
+  Complete the following Lean 4 code:
+
+  ```lean4
+  {problem}
+  ```
+
+  Before producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.
+  The plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof.