Merged
72 commits
- 9020605 Add swe-bench dataset (Kipok, Jul 25, 2025)
- cdad007 Support for multiple sandbox containers (Kipok, Jul 25, 2025)
- 6a3f14a Initial implementation for swe-agnet (Kipok, Jul 25, 2025)
- c40a333 Switching to apptainer (Kipok, Jul 25, 2025)
- 9c3a432 Switch to mounted trajectories dir (Kipok, Jul 26, 2025)
- 7e268f7 Roll-back sandbox changes (Kipok, Jul 26, 2025)
- 8e84856 Remove output (Kipok, Jul 26, 2025)
- 027fedf Descriptive error (Kipok, Jul 26, 2025)
- 4cba9e0 More logs (Kipok, Jul 26, 2025)
- 0f5e795 Hardcode model name (Kipok, Jul 27, 2025)
- abf3663 Change to sif (Kipok, Jul 27, 2025)
- 59a6efa Fix (Kipok, Jul 27, 2025)
- d2d0e81 Fix and retry (Kipok, Jul 28, 2025)
- f5e1b96 Tmp code for evals (Kipok, Jul 28, 2025)
- c9ea228 Merge branch 'main' into igitman/swe-bench-v2 (Kipok, Jul 28, 2025)
- ffe5590 Add eval (Kipok, Jul 28, 2025)
- 4266903 Remove trajectories dir (Kipok, Jul 28, 2025)
- 71fc8ef Evaluation type (Kipok, Jul 28, 2025)
- 0eebb44 Metrics (Kipok, Jul 28, 2025)
- 4e22208 Fix metrics (Kipok, Jul 28, 2025)
- 1b0890f Correct to .json (Kipok, Jul 28, 2025)
- 4e706c3 Correct to .json (Kipok, Jul 28, 2025)
- 1e5062b More fixes (Kipok, Jul 28, 2025)
- d5776de Clean up logs (Kipok, Jul 28, 2025)
- 8832795 Fixes (Kipok, Jul 28, 2025)
- f406307 Fix (Kipok, Jul 28, 2025)
- e067e5f Fixes (Kipok, Jul 28, 2025)
- ffb4e14 Cleaning up (Kipok, Jul 28, 2025)
- 07d249d More cleanups (Kipok, Jul 28, 2025)
- 5294019 Move PROMPT_CONFIG to generation args (Kipok, Jul 28, 2025)
- aae62d0 More fixes (Kipok, Jul 28, 2025)
- 0c23ef9 More fixes (Kipok, Jul 29, 2025)
- 7c59e9a Fix typo (Kipok, Jul 29, 2025)
- 9be0726 Add timeout parameter, enable logs (Kipok, Jul 29, 2025)
- 579fd28 Stream subprocess logs (Kipok, Jul 29, 2025)
- 5509e8d Properly handle missing patches (Kipok, Jul 29, 2025)
- 3f7cfbd Fix (Kipok, Jul 29, 2025)
- 694a7aa Debugging (Kipok, Jul 29, 2025)
- 8f193c8 Fix asyn gen for tasks (Kipok, Jul 29, 2025)
- 82c69b1 Remove debugging (Kipok, Jul 29, 2025)
- 36710f1 Fix model name? (Kipok, Jul 29, 2025)
- 8e93059 Pass in model to client (Kipok, Jul 29, 2025)
- bf97e26 Add model arg (Kipok, Jul 29, 2025)
- d6cdaef More checks on timeout (Kipok, Jul 29, 2025)
- e61ed0c Fixes (Kipok, Jul 30, 2025)
- 78f30ba Change to server host/port from config (Kipok, Jul 30, 2025)
- 633f2b7 Change to server host/port from config (Kipok, Jul 30, 2025)
- 6bca7d3 Fix typo (Kipok, Jul 30, 2025)
- 4ece1d3 Debug (Kipok, Jul 30, 2025)
- 3ec1f63 Update localhost (Kipok, Jul 30, 2025)
- a47d398 Rollback (Kipok, Jul 30, 2025)
- 702cdc6 Add default config (ludwig-n, Jul 30, 2025)
- b9b84c7 Support more sampling parameters (ludwig-n, Jul 31, 2025)
- 1ee04d5 Update port/host (Kipok, Aug 2, 2025)
- 565ebd9 Merge branch 'main' into igitman/swe-bench-v3 (Kipok, Aug 2, 2025)
- 0f97170 Make async (Kipok, Aug 2, 2025)
- fb6138e Fix prepare.py (Kipok, Aug 2, 2025)
- 54ba57c Update with proper async subprocess (Kipok, Aug 2, 2025)
- 1b045af Fix evaluation issues caused by localhost not resolving to 127.0.0.1 (ludwig-n, Aug 12, 2025)
- 0b02f25 Support OpenHands for SWE-bench (ludwig-n, Aug 14, 2025)
- 9497e5b Add max turns option (ludwig-n, Aug 14, 2025)
- 7d082c3 Merge branch 'main' into ludwig-n/openhands (Kipok, Aug 14, 2025)
- 2069d93 Fix prompt config (Kipok, Aug 14, 2025)
- e60c143 Merge branch 'main' into ludwig-n/openhands (Kipok, Aug 14, 2025)
- 8f09956 Cleanup (Kipok, Aug 14, 2025)
- 2f74137 Set privileged through env var (Kipok, Aug 14, 2025)
- ceeaac3 Add log file path (Kipok, Aug 14, 2025)
- 4c68c7c Update docs (Kipok, Aug 14, 2025)
- e36bf03 Install python from conda-forge (ludwig-n, Aug 15, 2025)
- f1e109b Install everything from conda-forge (ludwig-n, Aug 15, 2025)
- a83386a Rename swe-agent to swe_agent to fix hydra error (ludwig-n, Aug 15, 2025)
- d1c6c7c Merge branch 'main' into ludwig-n/openhands (Kipok, Aug 15, 2025)
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ Here are some of the features we support:
 - Evaluate your models on many popular benchmarks.
 - Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
 - Formal proofs in Lean: minif2f, proofnet
-- Coding skills: scicode, livecodebench, human-eval, mbpp
+- Coding skills: swe-bench, scicode, livecodebench, human-eval, mbpp
 - Chat/instruction following: ifbench, ifeval, arena-hard
 - General knowledge: mmlu, mmlu-pro, gpqa
 - Long context: ruler, mrcr
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.nemo-skills
@@ -29,4 +29,4 @@ RUN mkdir -p /opt/NeMo-Skills/requirements
 COPY pyproject.toml README.md /opt/NeMo-Skills/
 COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
 COPY requirements /opt/NeMo-Skills/requirements/
-RUN cd /opt/NeMo-Skills && pip install -e .[all]
+RUN cd /opt/NeMo-Skills && pip install -e .[all]
2 changes: 1 addition & 1 deletion docs/index.md
@@ -16,7 +16,7 @@ Here are some of the features we support:
 - Evaluate your models on many popular benchmarks.
 - Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
 - Formal proofs in Lean: minif2f, proofnet
-- Coding skills: scicode, livecodebench, human-eval, mbpp
+- Coding skills: swe-bench, scicode, livecodebench, human-eval, mbpp
 - Chat/instruction following: ifbench, ifeval, arena-hard
 - General knowledge: mmlu, mmlu-pro, gpqa
 - Long context: ruler
3 changes: 1 addition & 2 deletions docs/pipelines/evaluation.md
@@ -182,11 +182,10 @@ Inside [nemo_skills/dataset/gsm8k/\_\_init\_\_.py](https://github.com/NVIDIA/NeM
 
 ```python
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
 ```
 
 The prompt config and default generation arguments are passed to the
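The change above folds the old standalone `PROMPT_CONFIG` constant into a `++prompt_config` override inside `GENERATION_ARGS`. As a sketch of what a dataset `__init__.py` looks like after this PR (values mirror the gsm8k defaults shown in the diff; any other dataset would swap in its own strings):

```python
# Sketch of a dataset __init__.py following this PR's convention: the default
# prompt is selected via a ++prompt_config override inside GENERATION_ARGS
# rather than a separate PROMPT_CONFIG constant.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
GENERATION_ARGS = "++prompt_config=generic/math"
```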
3 changes: 1 addition & 2 deletions nemo_skills/dataset/aime24/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = f"++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/aime25/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/algebra222/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/amc23/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
6 changes: 3 additions & 3 deletions nemo_skills/dataset/answer-judge/__init__.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'judge/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "answer-judgement"
-EVAL_ARGS = "++eval_type=answer_judgement ++generation_key=judgement"
-GENERATION_ARGS = "++generation_key=judgement"
+# using judgement directly in metrics, no need for special evaluation
+EVAL_ARGS = "++eval_type=no-op ++generation_key=judgement"
+GENERATION_ARGS = "++prompt_config=judge/math ++generation_key=judgement"
5 changes: 2 additions & 3 deletions nemo_skills/dataset/arena-hard/__init__.py
@@ -14,11 +14,10 @@
 
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
 DATASET_GROUP = 'chat'
 METRICS_TYPE = "arena"
-EVAL_ARGS = "++eval_type=arena"
-GENERATION_ARGS = ""
+EVAL_ARGS = "++eval_type=no-op"  # using judgement directly in metrics, no need for special evaluation
+GENERATION_ARGS = "++prompt_config=generic/default"
 
 JUDGE_PIPELINE_ARGS = {
     "generation_module": "nemo_skills.inference.eval.arena_judge",
3 changes: 1 addition & 2 deletions nemo_skills/dataset/asdiv/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
72 changes: 39 additions & 33 deletions nemo_skills/dataset/bfcl_v3/prepare.py
@@ -12,17 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import subprocess
-import os
+import argparse
 import glob
-import tempfile
 import json
+import logging
+import os
 import shutil
-from nemo_skills.dataset.bfcl_v3.utils import func_doc_language_specific_pre_processing, convert_to_tool, is_multi_turn, load_file
+import subprocess
+import tempfile
 from pathlib import Path
-from nemo_skills.dataset.bfcl_v3.constants import DATA_FOLDER_PATH, MULTI_TURN_FUNC_DOC_PATH, MULTI_TURN_FUNC_DOC_FILE_MAPPING
-import argparse
-import logging
 
+from nemo_skills.dataset.bfcl_v3.constants import (
+    DATA_FOLDER_PATH,
+    MULTI_TURN_FUNC_DOC_FILE_MAPPING,
+    MULTI_TURN_FUNC_DOC_PATH,
+)
+from nemo_skills.dataset.bfcl_v3.utils import (
+    convert_to_tool,
+    func_doc_language_specific_pre_processing,
+    is_multi_turn,
+    load_file,
+)
 from nemo_skills.utils import get_logger_name
 
 LOG = logging.getLogger(get_logger_name(__file__))
@@ -34,7 +44,6 @@
 
 # Define the configuration as a dictionary
 DEFAULT_SETTINGS = """
-PROMPT_CONFIG = "null"
 DATASET_GROUP = "tool"
 METRICS_TYPE = "bfcl"
 EVAL_ARGS = "++eval_type=bfcl"
@@ -48,7 +57,7 @@ def process_multi_turn_test_case(instance, repo_root_dir):
    """
    Multi-turn test cases don't have the function doc in the prompt. We need to add them here.
    """
-    # Mark whether the instance is single-turn or multi-turn.
+    # Mark whether the instance is single-turn or multi-turn.
    # This is used to determine if the inference should be done in a single turn or multiple turns.
    if not is_multi_turn(instance["id"]):
        instance["single_turn"] = True
@@ -92,54 +101,54 @@ def process_file(repo_root_dir, input_file, output_file, model_type="llama-nemot
            test_category = instance["id"].rsplit("_", 1)[0]
            if idx == 0:
                LOG.info(f"Processing {test_category}")
 
            # TODO: Current preprocessing can be model dependent. This could be moved to inference time as well
            # Convert class-based method calls to function calls
            instance = process_multi_turn_test_case(instance, repo_root_dir)
 
            # Convert function calls to tools format and add them to the system prompt
            if "function" in instance:
                # Add the tools to the system prompt
                instance["function"] = func_doc_language_specific_pre_processing(instance["function"], test_category)
                instance["tools"] = convert_to_tool(instance["function"])
 
            f_out.write(json.dumps(instance) + "\n")
 
 
 def download_and_process_bfcl_data(repo_url, subfolder_path, output_dir, file_prefix="BFCL_v3", model_type="nemotron"):
    """
    Download JSON files from the BFCL GitHub repo via cloning
 
    Args:
        repo_url: GitHub repository URL
        subfolder_path: Path to the data subfolder in case of BFCL
        output_dir: Directory to save the processed JSONL files
        file_prefix: Only process files starting with this prefix
-        model_type: Formatting of functions and tools can be model dependent.
+        model_type: Formatting of functions and tools can be model dependent.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            # Clone repository with minimal depth
            print(f"Cloning repository {repo_url} to {temp_dir}")
-            subprocess.run([
-                "git", "clone", "--depth=1", repo_url, temp_dir
-            ], check=True, capture_output=True)
+            subprocess.run(["git", "clone", "--depth=1", repo_url, temp_dir], check=True, capture_output=True)
 
            # Find the target folder
            target_folder = Path(temp_dir) / subfolder_path
 
            if not os.path.exists(target_folder):
-                print(f"Folder {subfolder_path} not found in repository")
-                raise FileNotFoundError(f"Folder {subfolder_path} not found in {repo_url} cloned to {temp_dir}. The structure of BFCL has changed!")
+                raise FileNotFoundError(
+                    f"Folder {subfolder_path} not found in {repo_url} cloned to {temp_dir}. The structure of BFCL has changed!"
+                )
 
            # Find JSON files matching criteria
            json_pattern = os.path.join(target_folder, f"{file_prefix}*.json")
            json_files = glob.glob(json_pattern)
 
            print(f"Found {len(json_files)} JSON files matching pattern")
 
            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
+                os.makedirs(output_dir)
 
            processed_files = 0
            for input_file in json_files:
@@ -157,21 +166,21 @@
                # Copy the original json file to the split directory
                shutil.copy(input_file, os.path.join(split_dirname, filename))
                processed_files += 1
 
            print(f"Successfully processed {processed_files} JSON files to {output_dir}")
 
        except subprocess.CalledProcessError as e:
            print(f"Git command failed: {e}")
            print("Make sure git is installed and the repository URL is correct")
 
 
 def main(args):
-    LOG.warning("Currently processing according to the OpenAI model style which works for most models, including Qwen/Llama-Nemotron/DeepSeek.")
+    LOG.warning(
+        "Currently processing according to the OpenAI model style which works for most models, including Qwen/Llama-Nemotron/DeepSeek."
+    )
 
    download_and_process_bfcl_data(
-        REPO_URL, DATA_FOLDER_PATH,
-        output_dir=os.path.join(os.path.dirname(__file__)),
-        model_type=args.model_type
+        REPO_URL, DATA_FOLDER_PATH, output_dir=os.path.join(os.path.dirname(__file__)), model_type=args.model_type
    )
 
 
@@ -181,6 +190,3 @@ def main(args):
    args = parser.parse_args()
 
    main(args)
-
-
-
3 changes: 1 addition & 2 deletions nemo_skills/dataset/brumo25/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/college_math/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/comp-math-24-25/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/gaokao2023en/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/gpqa/__init__.py
@@ -15,9 +15,8 @@
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
 
-PROMPT_CONFIG = "eval/aai/mcq-4choices-boxed"
 DATASET_GROUP = "multichoice"
 METRICS_TYPE = "multichoice"
 EVAL_ARGS = "++eval_type=multichoice"
 EVAL_SPLIT = "diamond"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices-boxed"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/gsm-plus/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/gsm8k/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/hle/__init__.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/hle'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/hle"
 EVAL_SPLIT = "text"
 
 # Some answers are not possible to compare symbolically, so have to use a judge model
3 changes: 1 addition & 2 deletions nemo_skills/dataset/hmmt_feb25/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/math'
 DATASET_GROUP = 'math'
 METRICS_TYPE = "math"
 EVAL_ARGS = "++eval_type=math"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/math"
5 changes: 2 additions & 3 deletions nemo_skills/dataset/human-eval/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/codegen'
 DATASET_GROUP = 'code'
-METRICS_TYPE = "code"
+METRICS_TYPE = "evalplus"
 EVAL_ARGS = "++eval_type=evalplus ++eval_config.dataset=humaneval"
-GENERATION_ARGS = ""
+GENERATION_ARGS = "++prompt_config=generic/codegen"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/ifbench/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
 DATASET_GROUP = 'chat'
 METRICS_TYPE = "if"
 EVAL_ARGS = "++eval_type=ifbench ++generation_key=response"
-GENERATION_ARGS = "++generation_key=response"
+GENERATION_ARGS = "++generation_key=response ++prompt_config=generic/default"
3 changes: 1 addition & 2 deletions nemo_skills/dataset/ifeval/__init__.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
-PROMPT_CONFIG = 'generic/default'
 DATASET_GROUP = 'chat'
 METRICS_TYPE = "if"
 EVAL_ARGS = "++eval_type=if ++generation_key=response"
-GENERATION_ARGS = "++generation_key=response"
+GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response"
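Throughout these files, `EVAL_ARGS` and `GENERATION_ARGS` are space-separated Hydra-style `++key=value` override strings. A toy parser (not the pipeline's actual Hydra handling, where the `++` prefix means force-add) showing how such a string splits into individual overrides:

```python
def parse_overrides(args):
    """Split a space-separated '++key=value' string into a dict.

    Toy illustration only: real Hydra distinguishes +, ++, and plain
    overrides and supports a richer value grammar.
    """
    overrides = {}
    for token in args.split():
        # strip the '++' force-add prefix, then split on the first '='
        key, _, value = token.removeprefix("++").partition("=")
        overrides[key] = value
    return overrides
```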