diff --git a/README.md b/README.md
index 45d3b9c3f5..c95718e1fc 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Here are some of the features we support:
- Evaluate your models on many popular benchmarks.
- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
- Formal proofs in Lean: minif2f, proofnet
- - Coding skills: scicode, livecodebench, human-eval, mbpp
+ - Coding skills: swe-bench, scicode, livecodebench, human-eval, mbpp
- Chat/instruction following: ifbench, ifeval, arena-hard
- General knowledge: mmlu, mmlu-pro, gpqa
- Long context: ruler, mrcr
diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills
index e71872836e..3d746950ba 100644
--- a/dockerfiles/Dockerfile.nemo-skills
+++ b/dockerfiles/Dockerfile.nemo-skills
@@ -29,4 +29,4 @@ RUN mkdir -p /opt/NeMo-Skills/requirements
COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
COPY requirements /opt/NeMo-Skills/requirements/
-RUN cd /opt/NeMo-Skills && pip install -e .[all]
\ No newline at end of file
+RUN cd /opt/NeMo-Skills && pip install -e .[all]
diff --git a/docs/index.md b/docs/index.md
index 30584e949f..ba74510725 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -16,7 +16,7 @@ Here are some of the features we support:
- Evaluate your models on many popular benchmarks.
- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
- Formal proofs in Lean: minif2f, proofnet
- - Coding skills: scicode, livecodebench, human-eval, mbpp
+ - Coding skills: swe-bench, scicode, livecodebench, human-eval, mbpp
- Chat/instruction following: ifbench, ifeval, arena-hard
- General knowledge: mmlu, mmlu-pro, gpqa
- Long context: ruler
diff --git a/docs/pipelines/evaluation.md b/docs/pipelines/evaluation.md
index 2144758982..560c38de40 100644
--- a/docs/pipelines/evaluation.md
+++ b/docs/pipelines/evaluation.md
@@ -182,11 +182,10 @@ Inside [nemo_skills/dataset/gsm8k/\_\_init\_\_.py](https://github.com/NVIDIA/NeM
```python
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
```
The prompt config and default generation arguments are passed to the
diff --git a/nemo_skills/dataset/aime24/__init__.py b/nemo_skills/dataset/aime24/__init__.py
index a97eede9fe..6f161036ea 100644
--- a/nemo_skills/dataset/aime24/__init__.py
+++ b/nemo_skills/dataset/aime24/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/aime25/__init__.py b/nemo_skills/dataset/aime25/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/aime25/__init__.py
+++ b/nemo_skills/dataset/aime25/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/algebra222/__init__.py b/nemo_skills/dataset/algebra222/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/algebra222/__init__.py
+++ b/nemo_skills/dataset/algebra222/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/amc23/__init__.py b/nemo_skills/dataset/amc23/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/amc23/__init__.py
+++ b/nemo_skills/dataset/amc23/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/answer-judge/__init__.py b/nemo_skills/dataset/answer-judge/__init__.py
index 57ab2144ce..d0331844ee 100644
--- a/nemo_skills/dataset/answer-judge/__init__.py
+++ b/nemo_skills/dataset/answer-judge/__init__.py
@@ -13,8 +13,8 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'judge/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "answer-judgement"
-EVAL_ARGS = "++eval_type=answer_judgement ++generation_key=judgement"
-GENERATION_ARGS = "++generation_key=judgement"
+# using judgement directly in metrics, no need for special evaluation
+EVAL_ARGS = "++eval_type=no-op ++generation_key=judgement"
+GENERATION_ARGS = "++prompt_config=judge/math ++generation_key=judgement"
diff --git a/nemo_skills/dataset/arena-hard/__init__.py b/nemo_skills/dataset/arena-hard/__init__.py
index 9490a71ada..cedc03eb65 100644
--- a/nemo_skills/dataset/arena-hard/__init__.py
+++ b/nemo_skills/dataset/arena-hard/__init__.py
@@ -14,11 +14,10 @@
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
DATASET_GROUP = 'chat'
METRICS_TYPE = "arena"
-EVAL_ARGS = "++eval_type=arena"
-GENERATION_ARGS = ""
+EVAL_ARGS = "++eval_type=no-op" # using judgement directly in metrics, no need for special evaluation
+GENERATION_ARGS = "++prompt_config=generic/default"
JUDGE_PIPELINE_ARGS = {
"generation_module": "nemo_skills.inference.eval.arena_judge",
diff --git a/nemo_skills/dataset/asdiv/__init__.py b/nemo_skills/dataset/asdiv/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/asdiv/__init__.py
+++ b/nemo_skills/dataset/asdiv/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/bfcl_v3/prepare.py b/nemo_skills/dataset/bfcl_v3/prepare.py
index e95537eb19..637404573d 100644
--- a/nemo_skills/dataset/bfcl_v3/prepare.py
+++ b/nemo_skills/dataset/bfcl_v3/prepare.py
@@ -12,17 +12,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import subprocess
-import os
+import argparse
import glob
-import tempfile
import json
+import logging
+import os
import shutil
-from nemo_skills.dataset.bfcl_v3.utils import func_doc_language_specific_pre_processing, convert_to_tool, is_multi_turn, load_file
+import subprocess
+import tempfile
from pathlib import Path
-from nemo_skills.dataset.bfcl_v3.constants import DATA_FOLDER_PATH, MULTI_TURN_FUNC_DOC_PATH, MULTI_TURN_FUNC_DOC_FILE_MAPPING
-import argparse
-import logging
+
+from nemo_skills.dataset.bfcl_v3.constants import (
+ DATA_FOLDER_PATH,
+ MULTI_TURN_FUNC_DOC_FILE_MAPPING,
+ MULTI_TURN_FUNC_DOC_PATH,
+)
+from nemo_skills.dataset.bfcl_v3.utils import (
+ convert_to_tool,
+ func_doc_language_specific_pre_processing,
+ is_multi_turn,
+ load_file,
+)
from nemo_skills.utils import get_logger_name
LOG = logging.getLogger(get_logger_name(__file__))
@@ -34,7 +44,6 @@
# Define the configuration as a dictionary
DEFAULT_SETTINGS = """
-PROMPT_CONFIG = "null"
DATASET_GROUP = "tool"
METRICS_TYPE = "bfcl"
EVAL_ARGS = "++eval_type=bfcl"
@@ -48,7 +57,7 @@ def process_multi_turn_test_case(instance, repo_root_dir):
"""
Multi-turn test cases don't have the function doc in the prompt. We need to add them here.
"""
- # Mark whether the instance is single-turn or multi-turn.
+ # Mark whether the instance is single-turn or multi-turn.
# This is used to determine if the inference should be done in a single turn or multiple turns.
if not is_multi_turn(instance["id"]):
instance["single_turn"] = True
@@ -92,54 +101,54 @@ def process_file(repo_root_dir, input_file, output_file, model_type="llama-nemot
test_category = instance["id"].rsplit("_", 1)[0]
if idx == 0:
LOG.info(f"Processing {test_category}")
-
+
# TODO: Current preprocessing can be model dependent. This could be moved to inference time as well
# Convert class-based method calls to function calls
instance = process_multi_turn_test_case(instance, repo_root_dir)
-
+
# Convert function calls to tools format and add them to the system prompt
if "function" in instance:
# Add the tools to the system prompt
instance["function"] = func_doc_language_specific_pre_processing(instance["function"], test_category)
instance["tools"] = convert_to_tool(instance["function"])
-
+
f_out.write(json.dumps(instance) + "\n")
def download_and_process_bfcl_data(repo_url, subfolder_path, output_dir, file_prefix="BFCL_v3", model_type="nemotron"):
"""
Download JSON files from the BFCL GitHub repo via cloning
-
+
Args:
repo_url: GitHub repository URL
subfolder_path: Path to the data subfolder in case of BFCL
output_dir: Directory to save the processed JSONL files
file_prefix: Only process files starting with this prefix
- model_type: Formatting of functions and tools can be model dependent.
+ model_type: Formatting of functions and tools can be model dependent.
"""
with tempfile.TemporaryDirectory() as temp_dir:
try:
# Clone repository with minimal depth
print(f"Cloning repository {repo_url} to {temp_dir}")
- subprocess.run([
- "git", "clone", "--depth=1", repo_url, temp_dir
- ], check=True, capture_output=True)
-
+ subprocess.run(["git", "clone", "--depth=1", repo_url, temp_dir], check=True, capture_output=True)
+
# Find the target folder
target_folder = Path(temp_dir) / subfolder_path
-
+
if not os.path.exists(target_folder):
print(f"Folder {subfolder_path} not found in repository")
- raise FileNotFoundError(f"Folder {subfolder_path} not found in {repo_url} cloned to {temp_dir}. The structure of BFCL has changed!")
-
+ raise FileNotFoundError(
+ f"Folder {subfolder_path} not found in {repo_url} cloned to {temp_dir}. The structure of BFCL has changed!"
+ )
+
# Find JSON files matching criteria
json_pattern = os.path.join(target_folder, f"{file_prefix}*.json")
json_files = glob.glob(json_pattern)
-
+
print(f"Found {len(json_files)} JSON files matching pattern")
-
+
if not os.path.exists(output_dir):
- os.makedirs(output_dir)
+ os.makedirs(output_dir)
processed_files = 0
for input_file in json_files:
@@ -157,21 +166,21 @@ def download_and_process_bfcl_data(repo_url, subfolder_path, output_dir, file_pr
# Copy the original json file to the split directory
shutil.copy(input_file, os.path.join(split_dirname, filename))
processed_files += 1
-
+
print(f"Successfully processed {processed_files} JSON files to {output_dir}")
-
+
except subprocess.CalledProcessError as e:
print(f"Git command failed: {e}")
print("Make sure git is installed and the repository URL is correct")
def main(args):
- LOG.warning("Currently processing according to the OpenAI model style which works for most models, including Qwen/Llama-Nemotron/DeepSeek.")
+ LOG.warning(
+ "Currently processing according to the OpenAI model style which works for most models, including Qwen/Llama-Nemotron/DeepSeek."
+ )
download_and_process_bfcl_data(
- REPO_URL, DATA_FOLDER_PATH,
- output_dir=os.path.join(os.path.dirname(__file__)),
- model_type=args.model_type
+ REPO_URL, DATA_FOLDER_PATH, output_dir=os.path.join(os.path.dirname(__file__)), model_type=args.model_type
)
@@ -181,6 +190,3 @@ def main(args):
args = parser.parse_args()
main(args)
-
-
-
\ No newline at end of file
diff --git a/nemo_skills/dataset/brumo25/__init__.py b/nemo_skills/dataset/brumo25/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/brumo25/__init__.py
+++ b/nemo_skills/dataset/brumo25/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/college_math/__init__.py b/nemo_skills/dataset/college_math/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/college_math/__init__.py
+++ b/nemo_skills/dataset/college_math/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/comp-math-24-25/__init__.py b/nemo_skills/dataset/comp-math-24-25/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/comp-math-24-25/__init__.py
+++ b/nemo_skills/dataset/comp-math-24-25/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/gaokao2023en/__init__.py b/nemo_skills/dataset/gaokao2023en/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/gaokao2023en/__init__.py
+++ b/nemo_skills/dataset/gaokao2023en/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/gpqa/__init__.py b/nemo_skills/dataset/gpqa/__init__.py
index cd96824837..f99a39b265 100644
--- a/nemo_skills/dataset/gpqa/__init__.py
+++ b/nemo_skills/dataset/gpqa/__init__.py
@@ -15,9 +15,8 @@
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = "eval/aai/mcq-4choices-boxed"
DATASET_GROUP = "multichoice"
METRICS_TYPE = "multichoice"
EVAL_ARGS = "++eval_type=multichoice"
EVAL_SPLIT = "diamond"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices-boxed"
diff --git a/nemo_skills/dataset/gsm-plus/__init__.py b/nemo_skills/dataset/gsm-plus/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/gsm-plus/__init__.py
+++ b/nemo_skills/dataset/gsm-plus/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/gsm8k/__init__.py b/nemo_skills/dataset/gsm8k/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/gsm8k/__init__.py
+++ b/nemo_skills/dataset/gsm8k/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py
index a76ad58e49..9480671a6c 100644
--- a/nemo_skills/dataset/hle/__init__.py
+++ b/nemo_skills/dataset/hle/__init__.py
@@ -13,11 +13,10 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/hle'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/hle"
EVAL_SPLIT = "text"
# Some answers are not possible to compare symbolically, so have to use a judge model
diff --git a/nemo_skills/dataset/hmmt_feb25/__init__.py b/nemo_skills/dataset/hmmt_feb25/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/hmmt_feb25/__init__.py
+++ b/nemo_skills/dataset/hmmt_feb25/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/human-eval/__init__.py b/nemo_skills/dataset/human-eval/__init__.py
index e4d14fcbb1..a1fa64f773 100644
--- a/nemo_skills/dataset/human-eval/__init__.py
+++ b/nemo_skills/dataset/human-eval/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/codegen'
DATASET_GROUP = 'code'
-METRICS_TYPE = "code"
+METRICS_TYPE = "evalplus"
EVAL_ARGS = "++eval_type=evalplus ++eval_config.dataset=humaneval"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/codegen"
diff --git a/nemo_skills/dataset/ifbench/__init__.py b/nemo_skills/dataset/ifbench/__init__.py
index b91f5d44fa..4c939bd700 100644
--- a/nemo_skills/dataset/ifbench/__init__.py
+++ b/nemo_skills/dataset/ifbench/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
DATASET_GROUP = 'chat'
METRICS_TYPE = "if"
EVAL_ARGS = "++eval_type=ifbench ++generation_key=response"
-GENERATION_ARGS = "++generation_key=response"
+GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response"
diff --git a/nemo_skills/dataset/ifeval/__init__.py b/nemo_skills/dataset/ifeval/__init__.py
index cb4c6a81a8..896cec0323 100644
--- a/nemo_skills/dataset/ifeval/__init__.py
+++ b/nemo_skills/dataset/ifeval/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
DATASET_GROUP = 'chat'
METRICS_TYPE = "if"
EVAL_ARGS = "++eval_type=if ++generation_key=response"
-GENERATION_ARGS = "++generation_key=response"
+GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response"
diff --git a/nemo_skills/dataset/livecodebench-pro/__init__.py b/nemo_skills/dataset/livecodebench-pro/__init__.py
index 108f1f013c..03009e7cfb 100644
--- a/nemo_skills/dataset/livecodebench-pro/__init__.py
+++ b/nemo_skills/dataset/livecodebench-pro/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'eval/livecodebench/python_codegen'
DATASET_GROUP = 'code'
METRICS_TYPE = 'code'
EVAL_ARGS = "++eval_type=livecodebench_pro"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen"
diff --git a/nemo_skills/dataset/livecodebench-x/de/__init__.py b/nemo_skills/dataset/livecodebench-x/de/__init__.py
deleted file mode 100644
index cf4431fc85..0000000000
--- a/nemo_skills/dataset/livecodebench-x/de/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# settings that define how evaluation should be done by default (all can be changed from cmdline)
-DATASET_GROUP = 'code'
-METRICS_TYPE = 'livecodebench'
-EVAL_SPLIT = 'test_v6_2408_2505'
-EVAL_ARGS = "++eval_type=livecodebench"
-GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen"
diff --git a/nemo_skills/dataset/livecodebench-x/en/__init__.py b/nemo_skills/dataset/livecodebench-x/en/__init__.py
deleted file mode 100644
index cf4431fc85..0000000000
--- a/nemo_skills/dataset/livecodebench-x/en/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# settings that define how evaluation should be done by default (all can be changed from cmdline)
-DATASET_GROUP = 'code'
-METRICS_TYPE = 'livecodebench'
-EVAL_SPLIT = 'test_v6_2408_2505'
-EVAL_ARGS = "++eval_type=livecodebench"
-GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen"
diff --git a/nemo_skills/dataset/livecodebench-x/prepare.py b/nemo_skills/dataset/livecodebench-x/prepare.py
deleted file mode 100644
index 860f5e4c76..0000000000
--- a/nemo_skills/dataset/livecodebench-x/prepare.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import json
-import os
-from datetime import datetime
-from pathlib import Path
-
-from datasets import load_dataset
-from dateutil.relativedelta import relativedelta
-
-
-class PromptConstants:
- # reference: https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/reasoning/livecode_bench_cot/lcb_runner_cq/prompts/code_generation.py#L31
- FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
- FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the python program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
-
-
-def parse_data(release_version='release_latest'):
- data = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version)
- # data has the following fields
- # question_title: str
- # question_content: str
- # platform: Platform
- # question_id: str
- # contest_id: str
- # contest_date: datetime
- # starter_code: str
- # difficulty: Difficulty
- # public_test_cases: list[Test]
- # private_test_cases: list[Test]
- # metadata: dict
- return data
-
-
-def get_first_last_day(year_month_str):
- try:
- date_obj = datetime.strptime(year_month_str, "%Y-%m")
- first_day = date_obj.date().replace(day=1)
- last_day = (date_obj + relativedelta(months=1, days=-1)).date()
- return first_day, last_day
- except ValueError:
- raise ValueError("Invalid date format. Please use '%Y-%m'.")
-
-
-def parse_month_range(start_date, end_date):
- try:
- start_date, _ = get_first_last_day(start_date)
- _, end_date = get_first_last_day(end_date)
- return start_date, end_date
- except ValueError as e:
- raise ValueError(str(e))
-
-
-def clean_data(dataset):
- def map_fn(data):
- question = data["question_content"] + "\n\n"
- if data["starter_code"]:
- question += f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
- question += f"```python\n{data['starter_code']}\n```\n\n"
- else:
- question += f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n\n"
- question += f"```python\n# YOUR CODE HERE\n```\n\n"
-
- data["task_id"] = data["question_id"]
- data['question'] = question.replace(' ', '\t')
- return data
-
- remove_columns = [
- 'question_title',
- 'contest_id',
- 'public_test_cases',
- 'private_test_cases',
- 'metadata',
- 'question_content',
- 'platform',
- 'question_id',
- 'starter_code',
- ]
- dataset = dataset.map(map_fn, remove_columns=remove_columns)
- return dataset
-
-
-def prepare(start_date, end_date, release_version, output_dir):
- start_date, end_date = parse_month_range(start_date, end_date)
- start_yymm = start_date.strftime("%y%m")
- end_yymm = end_date.strftime("%y%m")
- output_file_path = os.path.join(output_dir, f"test_{release_version}_{start_yymm}_{end_yymm}.jsonl")
-
- assert release_version in ["v1", "v2", "v3", "v4", "v5", "v6"]
-
- data = parse_data(release_version=f"release_{release_version}")
- data = clean_data(data)
- print("Len of data: ", len(data))
-
- print("Writing to file...")
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
-
- with open(output_file_path, 'w') as f:
- for problem in data:
- input_date = datetime.strptime(problem['contest_date'], '%Y-%m-%dT%H:%M:%S').date()
- if start_date <= input_date <= end_date:
- json.dump(
- {
- "task_id": problem["task_id"],
- "question": problem["question"],
- "difficulty": problem["difficulty"],
- "subset_for_metrics": problem["difficulty"],
- "release_version": release_version,
- },
- f,
- )
- f.write('\n')
-
-
-DEFAULT_SPLITS = [
- ('v5', '2024-08', '2025-02'), # previous default
- ('v5', '2024-07', '2024-12'), # aai split
- ('v6', '2024-08', '2025-05'), # current default in lb
-]
-
-
-if __name__ == '__main__':
- # Write an argparse to a json file, read it in and parse it
- parser = argparse.ArgumentParser()
- parser.add_argument('--output_dir', type=str, default=str(Path(__file__).parent))
- parser.add_argument('--release_version', type=str, default='all')
- parser.add_argument('--start_date', type=str, default='all', help="End date in YYYY-MM format")
- parser.add_argument('--end_date', type=str, default='all', help="End date in YYYY-MM format")
-
- args = parser.parse_args()
-
- if args.release_version == 'all' and args.start_date == 'all' and args.end_date == 'all':
- # Prepare all splits
- for release_version, start_date, end_date in DEFAULT_SPLITS:
- print(f"Processing data for {release_version} from {start_date} to {end_date}")
- prepare(start_date, end_date, release_version, args.output_dir)
- else:
- if args.release_version == 'all' or args.start_date == 'all' or args.end_date == 'all':
- raise ValueError(
- "If preparing a custom split, you must specify all "
- "--release_version, --start_date, and --end_date arguments."
- )
- prepare(args.start_date, args.end_date, args.release_version, args.output_dir)
-
- # test_v5_2408_2502.jsonl: 279 samples
- # test_v5_2410_2502.jsonl: 166 samples
- # test_v5_2410_2504.jsonl: 166 samples
- # test_v6_2408_2502.jsonl: 374 samples
- # test_v6_2410_2502.jsonl: 261 samples
- # test_v6_2410_2504.jsonl: 341 samples
diff --git a/nemo_skills/dataset/livecodebench/__init__.py b/nemo_skills/dataset/livecodebench/__init__.py
index 8595a4c245..cf4431fc85 100644
--- a/nemo_skills/dataset/livecodebench/__init__.py
+++ b/nemo_skills/dataset/livecodebench/__init__.py
@@ -13,9 +13,8 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'eval/livecodebench/python_codegen'
DATASET_GROUP = 'code'
METRICS_TYPE = 'livecodebench'
EVAL_SPLIT = 'test_v6_2408_2505'
EVAL_ARGS = "++eval_type=livecodebench"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen"
diff --git a/nemo_skills/dataset/math-500/__init__.py b/nemo_skills/dataset/math-500/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/math-500/__init__.py
+++ b/nemo_skills/dataset/math-500/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/math-odyssey/__init__.py b/nemo_skills/dataset/math-odyssey/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/math-odyssey/__init__.py
+++ b/nemo_skills/dataset/math-odyssey/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/math/__init__.py b/nemo_skills/dataset/math/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/math/__init__.py
+++ b/nemo_skills/dataset/math/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/mawps/__init__.py b/nemo_skills/dataset/mawps/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/mawps/__init__.py
+++ b/nemo_skills/dataset/mawps/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/mbpp/__init__.py b/nemo_skills/dataset/mbpp/__init__.py
index 1537690edc..f7d58a9184 100644
--- a/nemo_skills/dataset/mbpp/__init__.py
+++ b/nemo_skills/dataset/mbpp/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/codegen'
DATASET_GROUP = 'code'
-METRICS_TYPE = "code"
+METRICS_TYPE = "evalplus"
EVAL_ARGS = "++eval_type=evalplus ++eval_config.dataset=mbpp"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/codegen"
diff --git a/nemo_skills/dataset/minerva_math/__init__.py b/nemo_skills/dataset/minerva_math/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/minerva_math/__init__.py
+++ b/nemo_skills/dataset/minerva_math/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/minif2f/__init__.py b/nemo_skills/dataset/minif2f/__init__.py
index 74bf0ceeb0..a9510c842c 100644
--- a/nemo_skills/dataset/minif2f/__init__.py
+++ b/nemo_skills/dataset/minif2f/__init__.py
@@ -14,9 +14,8 @@
# Default evaluation and generation settings for the minif2f dataset
-PROMPT_CONFIG = 'lean4/formal-proof'
DATASET_GROUP = 'lean4'
METRICS_TYPE = "lean4-proof"
EVAL_ARGS = "++eval_type=lean4-proof"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=lean4/formal-proof"
REQUIRES_SANDBOX = True
diff --git a/nemo_skills/dataset/mmlu-pro/__init__.py b/nemo_skills/dataset/mmlu-pro/__init__.py
index 08638b7189..53ae4e2cf0 100644
--- a/nemo_skills/dataset/mmlu-pro/__init__.py
+++ b/nemo_skills/dataset/mmlu-pro/__init__.py
@@ -15,8 +15,7 @@
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = "eval/aai/mcq-10choices-boxed"
DATASET_GROUP = "multichoice"
METRICS_TYPE = "multichoice"
EVAL_ARGS = "++eval_type=multichoice"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/aai/mcq-10choices-boxed"
diff --git a/nemo_skills/dataset/mmlu-redux/__init__.py b/nemo_skills/dataset/mmlu-redux/__init__.py
index ef64ab4154..20ec7704ae 100644
--- a/nemo_skills/dataset/mmlu-redux/__init__.py
+++ b/nemo_skills/dataset/mmlu-redux/__init__.py
@@ -14,9 +14,8 @@
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = "generic/general-boxed"
DATASET_GROUP = "multichoice"
METRICS_TYPE = "multichoice"
EVAL_ARGS = "++eval_type=multichoice"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/general-boxed"
diff --git a/nemo_skills/dataset/mmlu/__init__.py b/nemo_skills/dataset/mmlu/__init__.py
index c5d17436f8..adcd545590 100644
--- a/nemo_skills/dataset/mmlu/__init__.py
+++ b/nemo_skills/dataset/mmlu/__init__.py
@@ -14,8 +14,7 @@
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = "eval/aai/mcq-4choices-boxed"
DATASET_GROUP = "multichoice"
METRICS_TYPE = "multichoice"
EVAL_ARGS = "++eval_type=multichoice"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices-boxed"
diff --git a/nemo_skills/dataset/mrcr/__init__.py b/nemo_skills/dataset/mrcr/__init__.py
index 5b2ff73cc0..9975267ead 100644
--- a/nemo_skills/dataset/mrcr/__init__.py
+++ b/nemo_skills/dataset/mrcr/__init__.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
EVAL_SPLIT = 'all'
-PROMPT_CONFIG = 'null'
DATASET_GROUP = 'long-context'
METRICS_TYPE = 'mrcr'
EVAL_ARGS = '++eval_type=mrcr'
diff --git a/nemo_skills/dataset/olympiadbench/__init__.py b/nemo_skills/dataset/olympiadbench/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/olympiadbench/__init__.py
+++ b/nemo_skills/dataset/olympiadbench/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/omni-math/__init__.py b/nemo_skills/dataset/omni-math/__init__.py
index 122c4c81b3..7b3ed8b515 100644
--- a/nemo_skills/dataset/omni-math/__init__.py
+++ b/nemo_skills/dataset/omni-math/__init__.py
@@ -13,11 +13,10 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
# some answers are not possible to compare symbolically, so have to use a judge model
# setting openai judge by default, but can be overriden from command line for a locally hosted model
diff --git a/nemo_skills/dataset/proofnet/__init__.py b/nemo_skills/dataset/proofnet/__init__.py
index 641a3258c4..3bbf99c737 100644
--- a/nemo_skills/dataset/proofnet/__init__.py
+++ b/nemo_skills/dataset/proofnet/__init__.py
@@ -13,9 +13,8 @@
# limitations under the License.
# Default evaluation and generation settings for the minif2f dataset
-PROMPT_CONFIG = 'lean4/formal-proof'
DATASET_GROUP = 'lean4'
METRICS_TYPE = "lean4-proof"
EVAL_ARGS = "++eval_type=lean4-proof"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=lean4/formal-proof"
REQUIRES_SANDBOX = True
diff --git a/nemo_skills/dataset/putnam-bench/__init__.py b/nemo_skills/dataset/putnam-bench/__init__.py
index 9349d2f68f..a815de3ff1 100644
--- a/nemo_skills/dataset/putnam-bench/__init__.py
+++ b/nemo_skills/dataset/putnam-bench/__init__.py
@@ -14,9 +14,8 @@
# Default evaluation and generation settings for the PutnamBench
-PROMPT_CONFIG = 'lean4/formal-proof'
DATASET_GROUP = 'lean4'
METRICS_TYPE = "lean4-proof"
EVAL_ARGS = "++eval_type=lean4-proof"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=lean4/formal-proof"
REQUIRES_SANDBOX = True
diff --git a/nemo_skills/dataset/ruler/prepare.py b/nemo_skills/dataset/ruler/prepare.py
index b8602136e3..f8ef59ec6b 100644
--- a/nemo_skills/dataset/ruler/prepare.py
+++ b/nemo_skills/dataset/ruler/prepare.py
@@ -23,11 +23,11 @@
from pathlib import Path
DEFAULT_SETTINGS = """
-PROMPT_CONFIG = "generic/default"
DATASET_GROUP = "long-context"
METRICS_TYPE = "ruler"
EVAL_ARGS = "++eval_type=ruler ++eval_config.match_type={match_type}"
GENERATION_ARGS = (
+ "++prompt_config=generic/default "
"++inference.tokens_to_generate={tokens_to_generate} "
"++prefix_generation_to_response=True "
"++continue_prefix_generation=True"
diff --git a/nemo_skills/dataset/scicode/__init__.py b/nemo_skills/dataset/scicode/__init__.py
index 29bdb7f074..9173c7034e 100644
--- a/nemo_skills/dataset/scicode/__init__.py
+++ b/nemo_skills/dataset/scicode/__init__.py
@@ -13,11 +13,10 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'eval/scicode/default'
DATASET_GROUP = 'code'
METRICS_TYPE = 'scicode'
# generation is a dictionary instead of string and remove_thinking is done during inference
EVAL_ARGS = "++eval_type=scicode ++remove_thinking=False"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/scicode/default"
GENERATION_MODULE = "nemo_skills.inference.eval.scicode"
REQUIRES_SANDBOX = True
diff --git a/nemo_skills/dataset/svamp/__init__.py b/nemo_skills/dataset/svamp/__init__.py
index a97eede9fe..77c3830552 100644
--- a/nemo_skills/dataset/svamp/__init__.py
+++ b/nemo_skills/dataset/svamp/__init__.py
@@ -13,8 +13,7 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
diff --git a/nemo_skills/dataset/livecodebench-x/__init__.py b/nemo_skills/dataset/swe-bench/__init__.py
similarity index 77%
rename from nemo_skills/dataset/livecodebench-x/__init__.py
rename to nemo_skills/dataset/swe-bench/__init__.py
index cf4431fc85..cc34fed9b9 100644
--- a/nemo_skills/dataset/livecodebench-x/__init__.py
+++ b/nemo_skills/dataset/swe-bench/__init__.py
@@ -13,8 +13,9 @@
# limitations under the License.
# settings that define how evaluation should be done by default (all can be changed from cmdline)
+EVAL_SPLIT = "default"
DATASET_GROUP = 'code'
-METRICS_TYPE = 'livecodebench'
-EVAL_SPLIT = 'test_v6_2408_2505'
-EVAL_ARGS = "++eval_type=livecodebench"
-GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen"
+METRICS_TYPE = "swe-bench"
+EVAL_ARGS = "++eval_type=no-op" # evaluation is fused with generation for efficiency
+GENERATION_ARGS = ""
+GENERATION_MODULE = "nemo_skills.inference.eval.swebench"
diff --git a/nemo_skills/dataset/swe-bench/prepare.py b/nemo_skills/dataset/swe-bench/prepare.py
new file mode 100644
index 0000000000..6f7ac92a18
--- /dev/null
+++ b/nemo_skills/dataset/swe-bench/prepare.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from pathlib import Path
+
+import datasets
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--container_formatter",
+ type=str,
+ default="docker://swebench/sweb.eval.x86_64.{instance_id}",
+ help="Container formatter string. You can download .sif containers and store them in a mounted "
+ "directory which you can reference here to avoid redownloading all the time.",
+ ) # TODO: add download script
+ parser.add_argument("--split", type=str, default="test", help="SWE-bench dataset split to use")
+ parser.add_argument(
+ "--setup", type=str, default="default", help="Setup name (used as nemo-skills split parameter)."
+ )
+ parser.add_argument(
+ "--dataset_name",
+ type=str,
+ default="princeton-nlp/SWE-bench_Verified",
+ help="Dataset name to load",
+ )
+ args = parser.parse_args()
+
+ dataset_name = args.dataset_name
+ split = args.split
+ container_formatter = args.container_formatter
+
+ dataset = datasets.load_dataset(path=dataset_name, split=split)
+ output_file = Path(__file__).parent / f"{args.setup}.jsonl"
+ dataset = dataset.map(lambda example: {**example, "container_formatter": container_formatter})
+ dataset = dataset.add_column("container_id", list(range(len(dataset))))
+ dataset = dataset.add_column("dataset_name", [dataset_name] * len(dataset))
+ dataset = dataset.add_column("split", [split] * len(dataset))
+ dataset.to_json(output_file, orient="records", lines=True)
diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py
index d297ed57e5..a11af8a3cd 100644
--- a/nemo_skills/evaluation/evaluator/__init__.py
+++ b/nemo_skills/evaluation/evaluator/__init__.py
@@ -20,9 +20,9 @@
from nemo_skills.evaluation.evaluator.ifeval import eval_if
from nemo_skills.evaluation.evaluator.math import eval_lean4_proof, eval_lean4_statement, eval_math
from nemo_skills.evaluation.evaluator.mcq import eval_mcq
+from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
from nemo_skills.evaluation.evaluator.ruler import eval_ruler
from nemo_skills.evaluation.evaluator.scicode import eval_scicode
-from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
def dummy_eval(cfg):
@@ -35,8 +35,7 @@ def dummy_eval(cfg):
'if': eval_if,
'ifbench': eval_ifbench,
'bfcl': eval_bfcl,
- 'arena': dummy_eval,
- 'answer_judgement': dummy_eval,
+ 'no-op': dummy_eval,
'lean4-proof': eval_lean4_proof,
'lean4-statement': eval_lean4_statement,
'multichoice': eval_mcq,
diff --git a/nemo_skills/evaluation/metrics/code_metrics.py b/nemo_skills/evaluation/metrics/code_metrics.py
index bed30ca1d9..20bd4fb22b 100644
--- a/nemo_skills/evaluation/metrics/code_metrics.py
+++ b/nemo_skills/evaluation/metrics/code_metrics.py
@@ -15,7 +15,7 @@
from nemo_skills.evaluation.metrics.base import BaseMetrics
-class CodeMetrics(BaseMetrics):
+class EvalPlusMetrics(BaseMetrics):
def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
return {
"passing_base_tests": prediction['is_correct'],
@@ -46,6 +46,23 @@ def update(self, predictions):
self._compute_pass_at_k(predictions=predictions)
+class SweBenchMetrics(BaseMetrics):
+ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
+ return {
+ "issues_resolved": prediction['swe-bench-metrics']['resolved'],
+ "no_patch": not prediction['swe-bench-metrics']['patch_exists'],
+ "patch_cant_apply": not prediction['swe-bench-metrics']['patch_successfully_applied'],
+ }
+
+ @classmethod
+ def get_incorrect_sample(cls, prediction: dict) -> dict:
+ return {"swe-bench-metrics": {"resolved": False, "patch_exists": True, "patch_successfully_applied": True}}
+
+ def update(self, predictions):
+ super().update(predictions)
+ self._compute_pass_at_k(predictions=predictions)
+
+
class SciCodeMetrics(BaseMetrics):
def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
subtask_status_list = prediction['eval_status']
diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py
index c3c754f777..34d64cdc22 100644
--- a/nemo_skills/evaluation/metrics/map_metrics.py
+++ b/nemo_skills/evaluation/metrics/map_metrics.py
@@ -14,12 +14,17 @@
from nemo_skills.evaluation.metrics.answer_judgement_metrics import AnswerJudgementMetrics
from nemo_skills.evaluation.metrics.arena_metrics import ArenaMetrics
from nemo_skills.evaluation.metrics.bfcl_metrics import BFCLMetrics
-from nemo_skills.evaluation.metrics.code_metrics import CodeMetrics, LiveCodeBenchMetrics, SciCodeMetrics
+from nemo_skills.evaluation.metrics.code_metrics import (
+ EvalPlusMetrics,
+ LiveCodeBenchMetrics,
+ SciCodeMetrics,
+ SweBenchMetrics,
+)
from nemo_skills.evaluation.metrics.if_metrics import IFMetrics
from nemo_skills.evaluation.metrics.lean4_metrics import Lean4Metrics
from nemo_skills.evaluation.metrics.math_metrics import MathMetrics
-from nemo_skills.evaluation.metrics.ruler_metrics import RulerMetrics
from nemo_skills.evaluation.metrics.mrcr_metrics import MRCRMetrics
+from nemo_skills.evaluation.metrics.ruler_metrics import RulerMetrics
METRICS_MAP = {
"math": MathMetrics,
@@ -28,11 +33,12 @@
"answer-judgement": AnswerJudgementMetrics,
"arena": ArenaMetrics,
"bfcl": BFCLMetrics,
- "code": CodeMetrics,
+ "evalplus": EvalPlusMetrics,
"if": IFMetrics,
"multichoice": MathMetrics,
"ruler": RulerMetrics,
"livecodebench": LiveCodeBenchMetrics,
+ "swe-bench": SweBenchMetrics,
"scicode": SciCodeMetrics,
"mrcr": MRCRMetrics,
}
diff --git a/nemo_skills/inference/eval/arena_judge.py b/nemo_skills/inference/eval/arena_judge.py
index 03080827dd..4064be010a 100644
--- a/nemo_skills/inference/eval/arena_judge.py
+++ b/nemo_skills/inference/eval/arena_judge.py
@@ -51,7 +51,6 @@ class ArenaJudgeTask(GenerationTask):
def __init__(self, cfg: ArenaJudgeConfig):
super().__init__(cfg)
-
def log_example_prompt(self, all_data):
data_point = deepcopy(all_data[0])
@@ -62,7 +61,9 @@ def log_example_prompt(self, all_data):
data_point['answer_1'] = data_point['generation']
data_point['answer_2'] = data_point['baseline_answer']
- LOG.info("Example prompt:\nData dictionary: %s\nPrompt: %s", data_point, self.fill_prompt(data_point, all_data))
+ LOG.info(
+ "Example prompt:\nData dictionary: %s\nPrompt: %s", data_point, self.fill_prompt(data_point, all_data)
+ )
async def process_single_datapoint(self, data_point, all_data):
gen_base_data = data_point.copy()
@@ -76,7 +77,7 @@ async def process_single_datapoint(self, data_point, all_data):
# Make two async calls instead of one batch call
llm_output_1, llm_output_2 = await asyncio.gather(
super().process_single_datapoint(gen_base_data, all_data),
- super().process_single_datapoint(base_gen_data, all_data)
+ super().process_single_datapoint(base_gen_data, all_data),
)
return {
diff --git a/nemo_skills/inference/eval/scicode.py b/nemo_skills/inference/eval/scicode.py
index 118b496a17..8f4573a16e 100644
--- a/nemo_skills/inference/eval/scicode.py
+++ b/nemo_skills/inference/eval/scicode.py
@@ -49,9 +49,6 @@ class SciCodeGenerationConfig(GenerateSolutionsConfig):
class SciCodeGenerationTask(GenerationTask):
- def __init__(self, cfg: SciCodeGenerationConfig):
- super().__init__(cfg)
-
def log_example_prompt(self, data):
"""Scicode is multi-call benchmark, so we can't print a single prompt."""
return
diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py
index e69de29bb2..40453dba67 100644
--- a/nemo_skills/inference/eval/swebench.py
+++ b/nemo_skills/inference/eval/swebench.py
@@ -0,0 +1,519 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import glob
+import json
+import logging
+import os
+import shlex
+import sys
+from dataclasses import field
+from enum import Enum
+from pathlib import Path
+
+import hydra
+import tomlkit
+
+from nemo_skills.inference.generate import GenerationTask
+from nemo_skills.inference.model import server_params
+from nemo_skills.prompt.utils import get_config_path
+from nemo_skills.utils import get_help_message, get_logger_name, nested_dataclass, setup_logging
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+class SupportedAgentFrameworks(str, Enum):
+ swe_agent = "swe_agent"
+ openhands = "openhands"
+
+
+# Like nemo_skills.inference.generate.InferenceConfig, except most parameters are not passed by default
+# because they may not be supported by all LLM servers or agent frameworks.
+# tokens_to_generate is purposefully unlimited by default for SWE-bench.
+@nested_dataclass(kw_only=True)
+class SweBenchInferenceConfig:
+ temperature: float = 0.0 # Temperature of 0 means greedy decoding
+ top_k: int | None = None
+ top_p: float = 0.95
+ min_p: float | None = None
+ random_seed: int | None = None
+ tokens_to_generate: int | None = None
+ repetition_penalty: float | None = None
+ top_logprobs: int | None = None
+
+
+# Converts the parameter names above to the corresponding OpenAI parameter names.
+NS_TO_OPENAI_PARAM = {
+ # Officially part of the OpenAI Chat Completions API.
+ "tokens_to_generate": "max_tokens",
+ "top_logprobs": "top_logprobs",
+ "random_seed": "seed",
+ # Not in the official API, but still supported by some servers, e.g. vllm.
+ "top_k": "top_k",
+ "min_p": "min_p",
+ "repetition_penalty": "repetition_penalty",
+ # temperature and top_p are passed as separate SWE-agent parameters.
+}
+
+
+# Converts the parameter names above to the corresponding parameters in OpenHands's LLM config.
+# https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/core/config/llm_config.py#L12
+NS_TO_OPENHANDS_PARAM = {
+ # Supported on OpenHands's side. top_k is not OpenAI-compatible and so may break some servers.
+ "tokens_to_generate": "max_output_tokens",
+ "top_k": "top_k",
+ "random_seed": "seed",
+ # Not supported by OpenHands. Nemo-Skills will raise an error if they are passed.
+ "min_p": None,
+ "repetition_penalty": None,
+ "top_logprobs": None,
+ # temperature and top_p are passed separately.
+}
+
+
+# not inheriting since most parameters are not supported because we don't use our model client here
+# TODO: should we fix that?
+@nested_dataclass(kw_only=True)
+class SweBenchGenerationConfig:
+ input_file: str # Path to the input file with data
+ output_file: str # Where to save the generations
+
+ agent_framework: SupportedAgentFrameworks # Which agentic framework to use
+
+ # SWE-agent/OpenHands configuration file path. Can be specified in the same way as ns prompt configs
+ # If None, will use the default for the chosen framework
+ agent_config: str | None = None
+ agent_max_turns: int = 100 # Max iterations for the agent
+
+ swebench_tests_timeout: int = 60 * 30 # Timeout for the tests after applying the patch, in seconds
+
+ inference: SweBenchInferenceConfig = field(default_factory=SweBenchInferenceConfig) # LLM call parameters
+ # Inference server configuration {server_params}
+ server: dict = field(default_factory=dict)
+
+ max_samples: int = -1 # If > 0, will stop after generating this many samples. Useful for debugging
+ skip_filled: bool = False # If True, will skip the generations that are already in the output file
+
+ # maximum number of concurrent requests to the server for the async loop
+ # if sync loop is used, this is the batch size
+ max_concurrent_requests: int = 512
+ # chunk the dataset into equal sized parts and index into them
+ num_chunks: int | None = None # if specified, will split the data into chunks and only generate for one chunk
+ chunk_id: int | None = None # if specified, will index the specified chunk only
+
+ # if False, will not add num_generated_tokens and generation_time values.
+ # Useful when running judge jobs to keep the original generation statistics
+ add_generation_stats: bool = True
+ generation_key: str = "generation"
+ async_position_key: str = "_async_position" # key to use for preserving position in async loop in data dict
+ dry_run: bool = False
+
+ # if True, will move full generation to _full_generation key and keep cfg.generation_key without thinking tokens
+ remove_thinking: bool = False
+ thinking_begin: str = ""
+ thinking_end: str = ""
+
+
+cs = hydra.core.config_store.ConfigStore.instance()
+cs.store(name="base_swebench_generation_config", node=SweBenchGenerationConfig)
+
+
+class SweBenchGenerationTask(GenerationTask):
+ def __init__(self, cfg: SweBenchGenerationConfig):
+ self.cfg = cfg
+
+ LOG.info(
+ "Async loop is maintaining %d generations in parallel. "
+ "Use max_concurrent_requests to control the number of concurrent requests.",
+ self.cfg.max_concurrent_requests,
+ )
+ self.semaphore = asyncio.Semaphore(self.cfg.max_concurrent_requests)
+
+ # output_lock will be initialized when async_loop is called
+ self.output_lock = None
+
+ # needs to skip completed samples, not used otherwise
+ self.cfg.prompt_format = "ns"
+
+ def log_example_prompt(self, data):
+ return
+
+ def setup_prompt(self):
+ return
+
+ def setup_llm(self):
+ return
+
+ async def _execute_container_command(
+ self, data_point, command, expected_file_pattern, mode, max_retries=3, timeout=100000
+ ):
+ """Execute a command in an Apptainer container with retry logic."""
+ container_name = data_point["container_formatter"].format(
+ instance_id=data_point['instance_id'].replace('__', '_1776_')
+ )
+
+ # Create logs directory if it doesn't exist
+ logs_dir = self.output_dir / "apptainer_logs"
+ logs_dir.mkdir(exist_ok=True)
+ log_file_path = logs_dir / f"{data_point['instance_id']}_{mode}.log"
+ LOG.info("Starting execution of an apptainer command. Logs are available at %s", log_file_path)
+
+ # Fix localhost URLs not working sometimes
+ command = f"echo '127.0.0.1 localhost' >/etc/hosts && {command}"
+
+ # Launch Apptainer container and execute the command
+ apptainer_cmd = (
+ f"apptainer exec --writable-tmpfs --no-mount home,tmp,bind-paths "
+ f"--mount type=bind,src=/nemo_run/code,dst=/nemo_run/code "
+ f"--mount type=bind,src={self.output_dir},dst=/trajectories_mount "
+ f" {container_name} bash -c {shlex.quote(command)}"
+ )
+
+ # Retry apptainer command up to max_retries times
+ for attempt in range(max_retries):
+ try:
+ # Stream output to log file as it appears
+ with open(log_file_path, 'w') as log_file:
+ try:
+ # Create async subprocess
+ process = await asyncio.create_subprocess_shell(
+ apptainer_cmd, stdout=log_file, stderr=log_file
+ )
+ # Wait for completion with timeout
+ await asyncio.wait_for(process.communicate(), timeout=timeout)
+
+ if process.returncode != 0:
+ raise ValueError(f"Command failed with return code {process.returncode}")
+
+ except asyncio.TimeoutError:
+ # Kill the process if it's still running
+ if process.returncode is None:
+ process.kill()
+ await process.wait()
+ attempt = max_retries # Force exit the loop on timeout
+ raise ValueError("Command timed out")
+
+ # Look for the expected file
+ pred_files = glob.glob(expected_file_pattern, recursive=True)
+
+ if len(pred_files) == 1:
+ # Success, break out of retry loop
+ return pred_files[0]
+ else:
+ raise ValueError(
+ f"Expected exactly one file matching {expected_file_pattern} for {data_point['instance_id']}, "
+ f"found {len(pred_files)}."
+ )
+ except Exception as e:
+ if attempt < max_retries - 1:
+ LOG.warning(
+ "Attempt %d failed for instance %s. Retrying...",
+ attempt + 1,
+ data_point['instance_id'],
+ )
+ continue
+ else:
+ LOG.error("All %d attempts failed for instance %s", max_retries, data_point['instance_id'])
+ LOG.error("Apptainer command failed. Check logs at: %s", log_file_path)
+ raise ValueError(
+ f"Job failed for {data_point['instance_id']}. Check logs at: {log_file_path}. "
+ f"Expected exactly one file matching {expected_file_pattern}, "
+ f"found {len(pred_files) if 'pred_files' in locals() else 'unknown'}."
+ )
+
+ async def _run_swe_agent(self, data_point, api_base):
+ """
+ Runs SWE-agent on one instance.
+ Returns the absolute (not mounted) path to a .jsonl file in the SWE-bench evaluation format.
+ """
+ if self.cfg.agent_config is None:
+ self.cfg.agent_config = "eval/swe-bench/swe-agent/default"
+
+ completion_kwargs = {
+ openai_param: getattr(self.cfg.inference, ns_param)
+ for ns_param, openai_param in NS_TO_OPENAI_PARAM.items()
+ if getattr(self.cfg.inference, ns_param) is not None
+ }
+ if "top_logprobs" in completion_kwargs:
+ completion_kwargs["logprobs"] = True
+
+ swe_agent_cmd = (
+ # first installing swe-agent repo
+ "curl -LsSf https://astral.sh/uv/install.sh | sh && "
+ "source /root/.local/bin/env && "
+ "cd /root && "
+ "git clone https://github.com/SWE-agent/SWE-agent.git && "
+ "cd SWE-agent && "
+ "uv venv --python 3.12 venv && "
+ "source venv/bin/activate && "
+ "uv pip install -e . && "
+ # then running the agent
+ f"/root/SWE-agent/venv/bin/python -m sweagent run "
+ f" --config {get_config_path(self.cfg.agent_config)} "
+ f" --agent.model.name hosted_vllm/{self.cfg.server.model} "
+ f" --agent.model.api_base {api_base} "
+ f" --agent.model.temperature {self.cfg.inference.temperature} "
+ f" --agent.model.top_p {self.cfg.inference.top_p} "
+ f" --agent.model.completion_kwargs {shlex.quote(json.dumps(completion_kwargs))} "
+ f" --agent.model.per_instance_call_limit {self.cfg.agent_max_turns} "
+ f" --env.deployment.type local "
+ f" --env.repo.type preexisting "
+ f" --env.repo.repo_name testbed "
+ f" --env.repo.base_commit {data_point['base_commit']} "
+ f" --problem_statement.text {shlex.quote(data_point['problem_statement'])} "
+ f" --problem_statement.id {data_point['instance_id']} && "
+ # move trajectories to the mounted directory
+ f"cp -r trajectories /trajectories_mount/"
+ )
+
+ # Execute SWE-agent command
+ search_path = os.path.join(self.output_dir / "trajectories", "**", f"{data_point['instance_id']}.pred")
+ pred_file = await self._execute_container_command(data_point, swe_agent_cmd, search_path, mode="agent")
+
+ with open(pred_file, 'r') as f:
+ trajectory_dict = json.loads(f.read().strip())
+
+ # need to rename .pred to .jsonl
+ pred_jsonl_file = pred_file.replace('.pred', '.jsonl')
+ with open(pred_jsonl_file, 'w') as f:
+ f.write(json.dumps(trajectory_dict))
+
+ # TODO: get num_generated_tokens and other stats from .traj file
+ # looks like data['info']['model_stats']
+ # {'instance_cost': 0, 'tokens_sent': 40858, 'tokens_received': 1775, 'api_calls': 9}
+
+ return pred_jsonl_file
+
+ async def _run_openhands(self, data_point, api_base):
+ """
+ Runs OpenHands on one instance.
+ Returns the absolute (not mounted) path to a .jsonl file in the SWE-bench evaluation format.
+ """
+ if self.cfg.agent_config is None:
+ self.cfg.agent_config = "eval/swe-bench/openhands/default"
+
+ # Add parameters to config.toml
+
+ with open(get_config_path(self.cfg.agent_config, config_extension="toml"), "r") as f:
+ config = tomlkit.parse(f.read())
+
+ config["llm"]["model"] |= {
+ "model": self.cfg.server.model,
+ "base_url": api_base,
+ "temperature": self.cfg.inference.temperature,
+ "top_p": self.cfg.inference.top_p,
+ }
+
+ for ns_param, oh_param in NS_TO_OPENHANDS_PARAM.items():
+ if getattr(self.cfg.inference, ns_param) is not None:
+ if oh_param is not None:
+ config["llm"]["model"][oh_param] = getattr(self.cfg.inference, ns_param)
+ else:
+ supported_params = [key for key, value in NS_TO_OPENHANDS_PARAM.items() if value is not None]
+ raise ValueError(
+ f"Inference parameter {ns_param} is not supported by OpenHands. "
+ f"Supported inference parameters: temperature, top_p, {', '.join(supported_params)}."
+ )
+
+ config_str = tomlkit.dumps(config)
+
+ openhands_cmd = (
+ # make sure /workspace isn't mounted as a safety precaution
+ # (mounting it in the nemo-skills cluster config is ok, just not inside of apptainer specifically)
+ "if [ -d /workspace ]; then "
+ " echo 'Exiting because /workspace is mounted.' && "
+ " echo 'Please make sure /workspace is not mounted inside of Apptainer before running OpenHands.' && "
+ " echo 'This is because OpenHands DELETES EVERYTHING in the /workspace folder if it exists.' && "
+ " exit 1; "
+ "fi && "
+ # install openhands repo + dependencies
+ "cd /root && "
+ "curl -L -O \"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh\" && "
+ "bash Miniforge3-$(uname)-$(uname -m).sh -b && "
+ "eval \"$(/root/miniforge3/bin/conda shell.bash hook)\" && "
+ "mamba install -y --override-channels conda-forge::python=3.12 conda-forge::nodejs conda-forge::poetry conda-forge::tmux && "
+ "git clone https://github.com/All-Hands-AI/OpenHands.git && "
+ "cd OpenHands && "
+ "export INSTALL_DOCKER=0 && "
+ "make build && "
+ "poetry run python -m pip install datasets && "
+ # set up config files
+ f"echo {shlex.quote(config_str)} >config.toml && "
+ f"echo \"selected_ids = ['{data_point['instance_id']}']\" >evaluation/benchmarks/swe_bench/config.toml && "
+ # set local runtime & force verbose logs
+ "export RUNTIME=local && "
+ "export LOG_ALL_EVENTS=true && "
+ "export LOG_LEVEL=DEBUG && "
+ # run the agent
+ f"./evaluation/benchmarks/swe_bench/scripts/run_infer.sh "
+ f" llm.model " # name of llm config section in config.toml
+ f" HEAD " # openhands commit
+ f" CodeActAgent " # agent
+ f" 1 " # number of instances
+ f" {self.cfg.agent_max_turns} " # max agent iterations
+ f" 1 " # number of workers
+ f" {data_point['dataset_name']} " # dataset name
+ f" {data_point['split']} && " # dataset split
+ # move outputs to the mounted directory
+ f"mkdir -p /trajectories_mount/trajectories && "
+ f"cp -r evaluation/evaluation_outputs/outputs/*/*/* /trajectories_mount/trajectories/{data_point['instance_id']}"
+ )
+
+ # Execute OpenHands command
+ search_path = os.path.join(self.output_dir / "trajectories", "**", data_point['instance_id'], "output.jsonl")
+ out_file = await self._execute_container_command(data_point, openhands_cmd, search_path, mode="agent")
+
+ with open(out_file, "r") as f:
+ out_dict = json.loads(f.read().strip())
+
+ patch = out_dict["test_result"]["git_patch"]
+ if not patch:
+ patch = None
+
+ # Create file in the SWE-bench evaluation format
+ pred_file = out_file.replace("output.jsonl", "output_for_eval.jsonl")
+ with open(pred_file, "w") as f:
+ f.write(
+ json.dumps(
+ {
+ "model_name_or_path": out_dict["metadata"]["llm_config"]["model"],
+ "instance_id": out_dict["instance_id"],
+ "model_patch": patch,
+ }
+ )
+ )
+ return pred_file
+
+ async def process_single_datapoint(self, data_point, data):
+ """Will do all necessary generations to get a single answer for the data point."""
+ self.output_dir = Path(self.cfg.output_file).parent
+
+ # TODO: what's the right way to support api models, so that our standard parameters for that can be used?
+ # TODO: use self.cfg.server.base_url, etc. Can we pass in API key?
+
+ if 'base_url' in self.cfg.server:
+ api_base = self.cfg.server.base_url
+ else:
+ api_base = f"http://{self.cfg.server.host}:{self.cfg.server.port}/v1"
+
+ if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent:
+ pred_file = await self._run_swe_agent(data_point, api_base)
+ elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:
+ pred_file = await self._run_openhands(data_point, api_base)
+ else:
+ raise ValueError(
+ f"Unsupported agent framework: {self.cfg.agent_framework}. "
+ f"Supported frameworks: {', '.join(SupportedAgentFrameworks)}."
+ )
+
+ pred_mounted_path = pred_file.replace(str(self.output_dir), "/trajectories_mount")
+ with open(pred_file, "r") as f:
+ trajectory_dict = json.loads(f.read())
+
+ # Check if the trajectory has an empty patch before running evaluation
+ has_patch = trajectory_dict['model_patch'] is not None
+
+ if not has_patch:
+ report_json = {
+ data_point['instance_id']: {
+ "resolved": False,
+ "patch_exists": False,
+ "patch_successfully_applied": False,
+ }
+ }
+ else:
+ # Run full evaluation with streaming output
+ swe_bench_cmd = (
+ # first installing SWE-bench repo
+ "curl -LsSf https://astral.sh/uv/install.sh | sh && "
+ "source /root/.local/bin/env && "
+ "cd /root && "
+ "git clone https://github.com/Kipok/SWE-bench.git && "
+ "cd SWE-bench && "
+ "uv venv --python 3.12 venv && "
+ "source venv/bin/activate && "
+ "uv pip install -e . && "
+ # then running the evaluation with streaming output
+ f"/root/SWE-bench/venv/bin/python -m swebench.harness.run_local_evaluation "
+ f" --predictions_path {pred_mounted_path} "
+ f" --instance_ids {data_point['instance_id']} "
+ f" --run_id eval-outputs "
+ f" --timeout {self.cfg.swebench_tests_timeout} "
+ f" --dataset_name {data_point['dataset_name']} "
+ f" --split {data_point['split']} && "
+ f"cp -r logs/run_evaluation/eval-outputs /trajectories_mount/"
+ )
+
+ # Execute SWE-bench evaluation command
+ search_path = os.path.join(
+ self.output_dir, "eval-outputs", "**", f"{data_point['instance_id']}/report.json"
+ )
+ # TODO: should we fail on errors here? Seems that json isn't always generated
+ try:
+ report_file = await self._execute_container_command(
+ data_point,
+ swe_bench_cmd,
+ search_path,
+ mode="eval",
+ timeout=self.cfg.swebench_tests_timeout + 120,
+ )
+ except ValueError:
+ LOG.error("Failed to execute SWE-bench evaluation command for %s", data_point['instance_id'])
+ report_json = {
+ data_point['instance_id']: {
+ "resolved": False,
+ "patch_exists": True,
+ "patch_successfully_applied": False,
+ }
+ }
+ report_file = None
+
+ if report_file is not None:
+ with open(report_file, 'r') as f:
+ report_json = json.loads(f.read().strip())
+
+ output_dict = {
+ "swe-bench-metrics": report_json[data_point['instance_id']],
+ "swe-bench-outputs": trajectory_dict,
+ "generation": "", # required TODO: we should fix this
+ }
+
+ return output_dict
+
+
+GENERATION_TASK_CLASS = SweBenchGenerationTask
+
+
+# Update the hydra main to use the class method
+@hydra.main(version_base=None, config_name='base_swebench_generation_config')
+def swebench_generation(cfg: SweBenchGenerationConfig):
+ cfg = SweBenchGenerationConfig(_init_nested=True, **cfg)
+ LOG.info("Config used: %s", cfg)
+
+ task = SweBenchGenerationTask(cfg)
+ task.generate()
+
+
+HELP_MESSAGE = get_help_message(
+ SweBenchGenerationConfig,
+ server_params=server_params(),
+)
+
+if __name__ == "__main__":
+ if '--help' in sys.argv or '-h' in sys.argv:
+ print(HELP_MESSAGE)
+ else:
+ setup_logging()
+ swebench_generation()
diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py
index 48bd223915..a8b38e2c4b 100644
--- a/nemo_skills/pipeline/utils/eval.py
+++ b/nemo_skills/pipeline/utils/eval.py
@@ -106,9 +106,11 @@ def get_benchmark_args_from_module(
"Did you forget to run prepare data commands?"
)
- prompt_config = get_arg_from_module_or_dict(benchmark_module, "PROMPT_CONFIG", override_dict=override_dict)
+ # this is deprecated, should remove in the future
+ prompt_config = get_arg_from_module_or_dict(benchmark_module, "PROMPT_CONFIG", "", override_dict=override_dict)
generation_args = get_arg_from_module_or_dict(benchmark_module, "GENERATION_ARGS", "", override_dict=override_dict)
- generation_args = f"++prompt_config={prompt_config} {generation_args}"
+ if prompt_config:
+ generation_args = f"++prompt_config={prompt_config} {generation_args}"
requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict)
generation_module = get_arg_from_module_or_dict(
@@ -135,6 +137,12 @@ def get_benchmark_args_from_module(
eval_subfolder += f"{benchmark_group}/"
eval_subfolder += benchmark
+    # when running locally swe-bench launches apptainer inside docker and this requires elevated privileges
+ # TODO: is there a better way to handle this?
+ if benchmark == "swe-bench" and cluster_config['executor'] == 'local':
+ LOG.info("Swe-bench requires extra docker privileges, setting NEMO_SKILLS_PRIVILEGED_DOCKER=1")
+ os.environ['NEMO_SKILLS_PRIVILEGED_DOCKER'] = '1'
+
return BenchmarkArgs(
name=benchmark,
input_file=input_file,
@@ -228,7 +236,7 @@ def prepare_eval_commands(
if generation_type is not None:
if generation_module is not None:
raise ValueError("Cannot specify both generation_module and generation_type. ")
-
+
generation_module = GENERATION_MODULE_MAP[generation_type]
benchmarks_or_groups = {
diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py
index f0b4e35037..df992b6a09 100644
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
@@ -187,6 +187,7 @@ def get_executor(
ipc_mode="host",
volumes=mounts,
ntasks_per_node=1,
+ privileged=bool(os.getenv('NEMO_SKILLS_PRIVILEGED_DOCKER', 0)),
# locally we are always asking for all GPUs to be able to select a subset with CUDA_VISIBLE_DEVICES
num_gpus=-1 if gpus_per_node is not None else None,
network="host",
@@ -642,6 +643,5 @@ def get_nsight_cmd(profile_step_range):
f'export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/lib/x86_64-linux-gnu" && '
f"export NRL_NSYS_PROFILE_STEP_RANGE={profile_step_range} && "
'export NRL_NSYS_WORKER_PATTERNS="*policy*,*vllm*" && '
-
- )
- return cmd
\ No newline at end of file
+ )
+ return cmd
diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py
index c122a240cc..907901e2fb 100644
--- a/nemo_skills/pipeline/utils/generation.py
+++ b/nemo_skills/pipeline/utils/generation.py
@@ -317,8 +317,8 @@ def configure_client(
"server_port": server_port,
}
extra_arguments = (
- f"{extra_arguments} ++server.server_type={server_type} "
- f"++server.host=localhost ++server.port={server_port} ++server.model={model} "
+ f"{extra_arguments} ++server.server_type={server_type} ++server.host=127.0.0.1 "
+ f"++server.port={server_port} ++server.model={model} "
)
else: # model is hosted elsewhere
server_config = None
diff --git a/nemo_skills/prompt/config/eval/swe-bench/openhands/default.toml b/nemo_skills/prompt/config/eval/swe-bench/openhands/default.toml
new file mode 100644
index 0000000000..6f1a858c18
--- /dev/null
+++ b/nemo_skills/prompt/config/eval/swe-bench/openhands/default.toml
@@ -0,0 +1,7 @@
+[llm.model]
+# The following parameters are overridden by Nemo-Skills:
+# model, base_url, temperature, top_p.
+# Specifying them here will have no effect! Use Nemo-Skills options instead.
+api_key = "EMPTY"
+custom_llm_provider = "openai"
+native_tool_calling = true
diff --git a/nemo_skills/prompt/config/eval/swe-bench/swe-agent/default.yaml b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/default.yaml
new file mode 100644
index 0000000000..9d003c2ee1
--- /dev/null
+++ b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/default.yaml
@@ -0,0 +1,77 @@
+# Based on the default config from the SWE-agent repo:
+# https://github.com/SWE-agent/SWE-agent/blob/1375ec4fa69d300b432b9ca61d6b0e5d7259131c/config/default.yaml
+
+# note that this doesn't use nemo-skills prompt logic and instead is passed directly to swe-agent
+
+agent:
+ templates:
+ system_template: |-
+ You are a helpful assistant that can interact with a computer to solve tasks.
+ instance_template: |-
+    <uploaded_files>
+    {{working_dir}}
+    </uploaded_files>
+    I've uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description:
+
+    <pr_description>
+    {{problem_statement}}
+    </pr_description>
+
+    Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?
+    I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+    Your task is to make the minimal changes to non-test files in the {{working_dir}} directory to ensure the <pr_description> is satisfied.
+    Follow these steps to resolve the issue:
+    1. As a first step, it might be a good idea to find and read code relevant to the <pr_description>
+    2. Create a script to reproduce the error and execute it with `python <filename.py>` using the bash tool, to confirm the error
+    3. Edit the source code of the repo to resolve the issue
+    4. Rerun your reproduce script and confirm that the error is fixed!
+    5. Think about edge cases and make sure your fix handles them as well
+ Your thinking should be thorough and so it's fine if it's very long.
+ next_step_template: |-
+ OBSERVATION:
+ {{observation}}
+ next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ tools:
+ env_variables:
+ PAGER: cat
+ MANPAGER: cat
+ LESS: -R
+ PIP_PROGRESS_BAR: 'off'
+ TQDM_DISABLE: '1'
+ GIT_PAGER: cat
+ bundles:
+ - path: tools/registry
+ - path: tools/edit_anthropic
+ - path: tools/review_on_submit_m
+ registry_variables:
+ USE_FILEMAP: 'true'
+ SUBMIT_REVIEW_MESSAGES:
+ - |
+ Thank you for your work on this issue. Please carefully follow the steps below to help review your changes.
+
+ 1. If you made any changes to your code after running the reproduction script, please run the reproduction script again.
+ If the reproduction script is failing, please revisit your changes and make sure they are correct.
+ If you have already removed your reproduction script, please ignore this step.
+ 2. Remove your reproduction script (if you haven't done so already).
+ 3. If you have modified any TEST files, please revert them to the state they had before you started fixing the issue.
+        You can do this with `git checkout -- /path/to/test/file.py`. Use below <diff> to find the files you need to revert.
+ 4. Run the submit command again to confirm.
+
+ Here is a list of all of your changes:
+
+        <diff>
+        {{diff}}
+        </diff>
+ enable_bash_tool: true
+ parse_function:
+ type: function_calling
+ history_processors: []
+ model:
+ # The following parameters are overridden by Nemo-Skills:
+ # name, api_base, temperature, top_p, completion_kwargs, per_instance_call_limit.
+ # Specifying them here will have no effect! Use Nemo-Skills options instead.
+ per_instance_cost_limit: 0
+ total_cost_limit: 0
+ max_input_tokens: 0
+ max_output_tokens: 0
diff --git a/nemo_skills/prompt/config/eval/swe-bench/swe-agent/swe-agent-lm-32b.yaml b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/swe-agent-lm-32b.yaml
new file mode 100644
index 0000000000..78462b6974
--- /dev/null
+++ b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/swe-agent-lm-32b.yaml
@@ -0,0 +1,123 @@
+# Based on the config for SWE-agent-LM-32B from the SWE-smith repo:
+# https://github.com/SWE-bench/SWE-smith/blob/057f0478b6918bfcd89a51ceeec7229c60bb1028/agent/swesmith_infer.yaml
+
+# note that this doesn't use nemo-skills prompt logic and instead is passed directly to swe-agent
+
+agent:
+ templates:
+ system_template: |-
+ You are a helpful assistant that can interact with a computer to solve tasks.
+      <IMPORTANT>
+      * If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
+      </IMPORTANT>
+
+ You have access to the following functions:
+
+ ---- BEGIN FUNCTION #1: bash ----
+ Description: Execute a bash command in the terminal.
+
+ Parameters:
+ (1) command (string, required): The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.
+ ---- END FUNCTION #1 ----
+
+ ---- BEGIN FUNCTION #2: submit ----
+ Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
+ No parameters are required for this function.
+ ---- END FUNCTION #2 ----
+
+ ---- BEGIN FUNCTION #3: str_replace_editor ----
+ Description: Custom editing tool for viewing, creating and editing files
+ * State is persistent across command calls and discussions with the user
+ * If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
+ * The `create` command cannot be used if the specified `path` already exists as a file
+      * If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
+ * The `undo_edit` command will revert the last edit made to the file at `path`
+
+ Notes for using the `str_replace` command:
+ * The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
+ * If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
+ * The `new_str` parameter should contain the edited lines that should replace the `old_str`
+
+ Parameters:
+ (1) command (string, required): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
+ Allowed values: [`view`, `create`, `str_replace`, `insert`, `undo_edit`]
+ (2) path (string, required): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
+ (3) file_text (string, optional): Required parameter of `create` command, with the content of the file to be created.
+ (4) old_str (string, optional): Required parameter of `str_replace` command containing the string in `path` to replace.
+ (5) new_str (string, optional): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
+ (6) insert_line (integer, optional): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
+ (7) view_range (array, optional): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
+ ---- END FUNCTION #3 ----
+
+
+ If you choose to call a function ONLY reply in the following format with NO suffix:
+
+ Provide any reasoning for the function call here.
+      <function=example_function_name>
+      <parameter=example_parameter_1>value_1</parameter>
+      <parameter=example_parameter_2>
+      This is the value for the second parameter
+      that can span
+      multiple lines
+      </parameter>
+      </function>
+
+      <IMPORTANT>
+      Reminder:
+      - Function calls MUST follow the specified format, start with <function= and end with </function>
+      - Required parameters MUST be specified
+      - Only call one function at a time
+      - Always provide reasoning for your function call in natural language BEFORE the function call (not after)
+      </IMPORTANT>
+ instance_template: |-
+      <uploaded_files>
+      {{working_dir}}
+      </uploaded_files>
+      I've uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description:
+
+      <pr_description>
+      {{problem_statement}}
+      </pr_description>
+
+      Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?
+      I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+      Your task is to make the minimal changes to non-test files in the {{working_dir}} directory to ensure the <pr_description> is satisfied.
+      Follow these steps to resolve the issue:
+      1. As a first step, it might be a good idea to find and read code relevant to the <pr_description>
+      2. Create a script to reproduce the error and execute it with `python <filename.py>` using the bash tool, to confirm the error
+      3. Edit the source code of the repo to resolve the issue
+      4. Rerun your reproduce script and confirm that the error is fixed!
+      5. Think about edge cases and make sure your fix handles them as well
+ Your thinking should be thorough and so it's fine if it's very long.
+ next_step_template: |-
+ OBSERVATION:
+ {{observation}}
+ next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ max_observation_length: 70000
+ tools:
+ bundles:
+ - path: tools/registry
+ - path: tools/edit_anthropic
+ - path: tools/submit
+ env_variables:
+ USE_FILEMAP: 'true'
+ enable_bash_tool: true
+ parse_function:
+ type: xml_function_calling
+ str_replace_editor:
+ arguments:
+ - name: view_range
+ argument_format: "--view_range {{value}}"
+ execution_timeout: 300
+ history_processors:
+ - type: last_n_observations
+ n: 5
+ model:
+ # The following parameters are overridden by Nemo-Skills:
+ # name, api_base, temperature, top_p, completion_kwargs, per_instance_call_limit.
+ # Specifying them here will have no effect! Use Nemo-Skills options instead.
+ per_instance_cost_limit: 0
+ total_cost_limit: 0
+ max_input_tokens: 0
+ max_output_tokens: 0
diff --git a/nemo_skills/prompt/utils.py b/nemo_skills/prompt/utils.py
index 2fa0624679..8078d2133c 100644
--- a/nemo_skills/prompt/utils.py
+++ b/nemo_skills/prompt/utils.py
@@ -17,9 +17,9 @@
import random
import re
from dataclasses import asdict, field
+from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, List, Optional
-from itertools import zip_longest
import yaml
@@ -244,7 +244,7 @@ def fill(
and use it to construct the prompt. You input_dict should also have "assistant" key in all
turns except last containing assistant reply.
return_templated_dict: Indicates whether to return a messages list where the template is used
- to fill the prompt. If so, a list of dicts with 'role' and 'content' keys will be returned.
+ to fill the prompt. If so, a list of dicts with 'role' and 'content' keys will be returned.
In this case the final user and assistant messages will include special tokens.
Returns:
@@ -259,12 +259,16 @@ def fill(
if self.config.template:
if multi_turn_key is None:
- prompt_string = (system_string := self.SYSTEM_FORMAT.format(
- system=self.config.system.format(**input_dict), **asdict(self.config.template)
- ))
- prompt_string += (user_string := self.TURN_BEGIN_FORMAT.format(
- user=self.build_user_message(input_dict), **asdict(self.config.template)
- ))
+ prompt_string = (
+ system_string := self.SYSTEM_FORMAT.format(
+ system=self.config.system.format(**input_dict), **asdict(self.config.template)
+ )
+ )
+ prompt_string += (
+ user_string := self.TURN_BEGIN_FORMAT.format(
+ user=self.build_user_message(input_dict), **asdict(self.config.template)
+ )
+ )
user_strings = [user_string]
assistant_strings = []
if generation:
@@ -273,30 +277,40 @@ def fill(
# Append generation without the closing tag.
prompt_string += (assistant_string := generation)
else:
- prompt_string += (assistant_string := self.TURN_END_FORMAT.format(
- assistant=generation, **asdict(self.config.template)
- ))
+ prompt_string += (
+ assistant_string := self.TURN_END_FORMAT.format(
+ assistant=generation, **asdict(self.config.template)
+ )
+ )
assistant_strings.append(assistant_string)
else:
- prompt_string = (system_string := self.SYSTEM_FORMAT.format(
- system=self.config.system.format(**input_dict), **asdict(self.config.template)
- ))
+ prompt_string = (
+ system_string := self.SYSTEM_FORMAT.format(
+ system=self.config.system.format(**input_dict), **asdict(self.config.template)
+ )
+ )
user_strings = []
assistant_strings = []
for turn in input_dict[multi_turn_key][:-1]:
- prompt_string += (user_string := self.TURN_BEGIN_FORMAT.format(
- user=self.build_user_message(turn), **asdict(self.config.template)
- ))
+ prompt_string += (
+ user_string := self.TURN_BEGIN_FORMAT.format(
+ user=self.build_user_message(turn), **asdict(self.config.template)
+ )
+ )
user_strings.append(user_string)
- prompt_string += (assistant_string := self.TURN_END_FORMAT.format(
- assistant=turn["assistant"], **asdict(self.config.template)
- ))
+ prompt_string += (
+ assistant_string := self.TURN_END_FORMAT.format(
+ assistant=turn["assistant"], **asdict(self.config.template)
+ )
+ )
assistant_strings.append(assistant_string)
- prompt_string += (user_string := self.TURN_BEGIN_FORMAT.format(
- user=self.build_user_message(input_dict[multi_turn_key][-1]), **asdict(self.config.template)
- ))
+ prompt_string += (
+ user_string := self.TURN_BEGIN_FORMAT.format(
+ user=self.build_user_message(input_dict[multi_turn_key][-1]), **asdict(self.config.template)
+ )
+ )
user_strings.append(user_string)
prompt_string += generation
if generation:
@@ -351,6 +365,20 @@ def __str__(self):
return str(self.config)
+def get_config_path(config: str, config_dir: str | None = None, config_extension: str = "yaml") -> Path:
+ if config_dir is None:
+ config_dir = str(Path(__file__).parent.absolute() / 'config')
+
+ if config.endswith(f".{config_extension}"):
+ config_path = Path(config).absolute()
+ elif config.startswith("nemo_skills"):
+ config_path = Path(__file__).parents[2].absolute() / f"{config}.{config_extension}"
+ else:
+ config_path = Path(config_dir) / f"{config}.{config_extension}"
+
+ return config_path
+
+
def load_config(config: str, config_dir: str | None = None) -> dict:
"""
Reads the prompt config/template from the yaml file.
@@ -365,15 +393,7 @@ def load_config(config: str, config_dir: str | None = None) -> dict:
Returns:
The loaded dictionary.
"""
- if config_dir is None:
- config_dir = str(Path(__file__).parent.absolute() / 'config')
-
- if config.endswith(".yaml"):
- config_path = Path(config).absolute()
- elif config.startswith("nemo_skills"):
- config_path = Path(__file__).parents[2].absolute() / f"{config}.yaml"
- else:
- config_path = Path(config_dir) / f"{config}.yaml"
+ config_path = get_config_path(config, config_dir)
with open(config_path, "rt", encoding="utf-8") as fin:
return yaml.safe_load(fin)
@@ -412,7 +432,7 @@ def get_prompt(
else:
code_tags_dict = code_tags
code_tags_obj = CodeTags(**code_tags_dict)
-
+
prompt = Prompt(PromptConfig(**config, template=template_obj, code_tags=code_tags_obj))
if examples_type is not None:
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index ae4858d1e5..2a6cccb396 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -61,7 +61,6 @@
def test_dataset_init_defaults():
for dataset, _ in DATASETS:
dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset}")
- assert hasattr(dataset_module, 'PROMPT_CONFIG'), f"{dataset} is missing PROMPT_CONFIG attribute"
assert hasattr(dataset_module, 'DATASET_GROUP'), f"{dataset} is missing DATASET_GROUP attribute"
assert dataset_module.DATASET_GROUP in [
"math",
diff --git a/tests/test_generation.py b/tests/test_generation.py
index 004b92b2a2..b8aaf7f0b1 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -37,12 +37,10 @@ def test_generation_dryrun_llama(dataset, split):
"""Testing the default prompts for each dataset."""
prompt_template = "llama3-instruct"
extra_args = importlib.import_module(f'nemo_skills.dataset.{dataset}').GENERATION_ARGS
- prompt_config = importlib.import_module(f'nemo_skills.dataset.{dataset}').PROMPT_CONFIG
cmd = (
"python nemo_skills/inference/generate.py "
f" ++output_file=./test.jsonl "
f" ++prompt_template={prompt_template} "
- f" ++prompt_config={prompt_config} "
f" ++input_file=./nemo_skills/dataset/{dataset}/{split}.jsonl "
f" ++server.server_type=sglang "
f" ++server.model=dummy "