Skip to content
1 change: 0 additions & 1 deletion nemo_skills/dataset/bfcl_v3/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def ensure_bfcl_eval_installed():
"pip",
"install",
"--no-cache-dir",
"-e",
str(repo_dir / BFCL_EVAL_SUBDIR),
"--extra-index-url",
BFCL_EXTRA_INDEX_URL,
Expand Down
19 changes: 6 additions & 13 deletions nemo_skills/dataset/gsm-plus/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@
import json
import os
import pathlib
import urllib.request
from pathlib import Path

from nemo_skills.dataset.utils import add_rounding_instruction

URL = "https://huggingface.co/datasets/qintongli/GSM-Plus/resolve/main/data/test-00000-of-00001.jsonl?download=true"
from datasets import load_dataset

from nemo_skills.dataset.utils import add_rounding_instruction

if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -47,10 +45,9 @@
split = "test"
data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)
original_file = str(data_dir / f"original_{split}.jsonl")
output_file = str(data_dir / f"{split}.jsonl")

urllib.request.urlretrieve(URL, original_file)
dataset = load_dataset("qintongli/GSM-Plus", split="test")

file_rounded = None
if not args.no_rounding_instructions:
Expand All @@ -62,10 +59,9 @@
for key in cleaning_options.keys():
cleaning_options[key] = set(cleaning_options[key])

with open(original_file, "rt") as original, open(output_file, "w") as test_full:
original_data = [json.loads(line) for line in original.readlines()]
cleaning_options["none"] = set(range(len(original_data)))
for i, original_entry in enumerate(original_data):
cleaning_options["none"] = set(range(len(dataset)))
with open(output_file, "w") as test_full:
for i, original_entry in enumerate(dataset):
if (
original_entry["perturbation_type"].replace(" ", "_") in args.categories
and i in cleaning_options[args.cleaning]
Expand Down Expand Up @@ -107,6 +103,3 @@

if file_rounded:
file_rounded.close()

# cleaning up
os.remove(original_file)
48 changes: 12 additions & 36 deletions nemo_skills/dataset/scicode/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,23 @@
# limitations under the License.

import json
import os
import urllib.request
from pathlib import Path

URL = "https://huggingface.co/datasets/SciCode1/SciCode/raw/main/problems_{split}.jsonl"

from datasets import load_dataset

if __name__ == "__main__":
data_dir = Path(__file__).absolute().parent
for split in ["dev", "test"]:
original_file = str(data_dir / f"original_{split}.jsonl")
data_dir.mkdir(exist_ok=True)
output_file = str(data_dir / f"{split}.jsonl")

if not os.path.exists(original_file):
urllib.request.urlretrieve(URL.format(split=split), original_file)

data = []
with open(original_file, "rt", encoding="utf-8") as fin:
for line in fin:
entry = json.loads(line)
new_entry = entry # TODO?
data.append(new_entry)
data_dir.mkdir(exist_ok=True)

with open(output_file, "wt", encoding="utf-8") as fout:
for entry in data:
fout.write(json.dumps(entry) + "\n")
dataset = load_dataset("SciCode1/SciCode")

    # Concatenate the two splits to make test_aai
dev_file = data_dir / "dev.jsonl"
test_file = data_dir / "test.jsonl"
split_mapping = {"validation": "dev", "test": "test"}
test_aai_file = data_dir / "test_aai.jsonl"

with open(dev_file, "rt", encoding="utf-8") as fin:
dev_data = [json.loads(line) for line in fin]
with open(test_file, "rt", encoding="utf-8") as fin:
test_data = [json.loads(line) for line in fin]

test_aai_data = []
test_aai_data.extend(dev_data)
test_aai_data.extend(test_data)
with open(test_aai_file, "w", encoding="utf-8") as fout:
for entry in test_aai_data:
fout.write(json.dumps(entry) + "\n")
with open(test_aai_file, "w", encoding="utf-8") as test_aai_fout:
for hf_split, output_split in split_mapping.items():
output_file = data_dir / f"{output_split}.jsonl"
with open(output_file, "wt", encoding="utf-8") as fout:
for entry in dataset[hf_split]:
line = json.dumps(entry) + "\n"
fout.write(line)
test_aai_fout.write(line)
Comment on lines +28 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File test_aai.jsonl is opened in the outer scope but written to inside the nested loop for each split. Because the outer with block keeps the file open for the entire loop, dev.jsonl and test.jsonl entries are written to it sequentially through two separate file handles (fout and test_aai_fout). While this works, it is cleaner to write test_aai.jsonl only after the per-split files are complete.

Additionally, per CONTRIBUTING.md guidelines (line 40-42), avoid data loss by completing computation before writing. Consider writing all files separately first, then concatenating to avoid partial writes if there's a failure.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

4 changes: 3 additions & 1 deletion tests/slurm-tests/clone_and_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ git clone https://github.com/NVIDIA-NeMo/Skills.git NeMo-Skills
cd NeMo-Skills

curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=$LOCAL_WORKSPACE sh
$LOCAL_WORKSPACE/uv venv .venv --python 3.10
$LOCAL_WORKSPACE/uv venv .venv --python 3.10 --seed
source .venv/bin/activate
VENV_BIN="$(dirname "$(command -v python)")"
export PATH="$VENV_BIN:$PATH"
$LOCAL_WORKSPACE/uv pip install -e .

./tests/slurm-tests/run_all.sh $1
18 changes: 9 additions & 9 deletions tests/slurm-tests/qwen3_4b_evals/check_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@

TOOLCALLING_METRIC_RANGES = {
("overall_accuracy", "accuracy"): (61.0, 67.0),
("non_live_single_turn", "overall_non_live", "accuracy"): (84.0, 90.0),
("non_live_single_turn", "non_live_ast", "accuracy"): (85.0, 92.0),
("non_live_single_turn", "irrelevance", "accuracy"): (79.0, 86.0),
("live_single_turn", "overall_live", "accuracy"): (76.0, 83.0),
("live_single_turn", "live_ast", "accuracy"): (79.0, 86.0),
("live_single_turn", "live_irrelevance", "accuracy"): (73.0, 80.0),
("live_single_turn", "live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance
("multi_turn", "overall_multi_turn", "accuracy"): (20.0, 30.0),
("overall_non_live", "accuracy"): (84.0, 90.0),
("non_live_ast", "accuracy"): (85.0, 92.0),
("non_live_irrelevance", "accuracy"): (79.0, 86.0),
("overall_live", "accuracy"): (76.0, 83.0),
("live_ast", "accuracy"): (79.0, 86.0),
("live_irrelevance", "accuracy"): (73.0, 80.0),
("live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance
("overall_multi_turn", "accuracy"): (20.0, 30.0),
}


Expand All @@ -43,7 +43,7 @@

def check_results(eval_dir: str):
f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json")
data = load_json(f)
data = load_json(f)["bfcl_v3"]
for category_tuple, expected_range in TOOLCALLING_METRIC_RANGES.items():
val = float(get_nested_value(data, category_tuple))
lo, hi = expected_range
Expand Down
12 changes: 6 additions & 6 deletions tests/slurm-tests/qwen3_4b_evals/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ def eval_qwen3_bfcl(workspace, cluster, expname_prefix, wandb_project):

eval(
ctx=wrap_arguments(
f"++inference.temperature=0.6 "
f"++inference.top_p=0.95 "
f"++inference.tokens_to_generate=8192 "
f"++model_name={model} "
f"++parse_reasoning=True "
"++inference.temperature=0.6 "
"++inference.top_p=0.95 "
"++inference.tokens_to_generate=8192 "
# somehow Qwen3-4B was removed from default names, but 8b should be same parser
"++model_name=Qwen/Qwen3-8B-FC "
"++parse_reasoning=True "
),
cluster=cluster,
benchmarks="bfcl_v3",
Expand Down Expand Up @@ -57,7 +58,6 @@ def eval_qwen3_online_genselect(workspace, cluster, expname_prefix, wandb_projec
"++parallel_thinking.mode=genselect "
"++server.enable_soft_fail=True "
"++server.context_limit_retry_strategy=reduce_generation "
# "++skip_filled=False "
),
cluster=cluster,
benchmarks="aime24:1",
Expand Down
15 changes: 2 additions & 13 deletions tests/slurm-tests/super_49b_evals/check_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent)) # for utils.py
from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402
from utils import assert_all, load_json, soft_assert # noqa: E402

REASONING_TASKS = [
"math-500",
Expand Down Expand Up @@ -67,7 +67,7 @@
"aime25": (0.0, 10.0),
"gpqa": (49.0, 56.0),
"mmlu-pro": (68.0, 71.0),
"livecodebench": (27.5, 32.5),
"livecodebench": (26.0, 32.5),
"scicode": {
"problem_accuracy": (5.0, 10.0),
"subtask_accuracy": (20.0, 28.0),
Expand Down Expand Up @@ -175,15 +175,6 @@ def check_reasoning(eval_dir: str, mode: str):
soft_assert(lo <= val <= hi, f"{bench} ({mode}) {field}={val} out of range [{lo},{hi}]")


def check_toolcalling(eval_dir: str, mode: str):
f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json")
data = load_json(f)
for cat, path in TOOLCALLING_METRIC_PATHS.items():
val = float(get_nested_value(data, path))
lo, hi = TOOLCALLING_METRIC_RANGES[mode][cat]
soft_assert(lo <= val <= hi, f"TOOL-CALLING ({mode}) {cat}={val} out of range [{lo},{hi}]")


def check_ruler(eval_dir: str, mode: str):
f = os.path.join(eval_dir, "eval-results", "ruler.nemotron_super_128k_slurm_ci", "metrics.json")
data = load_json(f)
Expand All @@ -202,8 +193,6 @@ def main():

check_reasoning(eval_root / "reasoning_off", "reasoning_off")
check_reasoning(eval_root / "reasoning_on", "reasoning_on")
check_toolcalling(eval_root / "reasoning_on_tool_calling", "reasoning_on")
check_toolcalling(eval_root / "reasoning_off_tool_calling", "reasoning_off")
check_ruler(eval_root / "reasoning_off_ruler", "reasoning_off")

assert_all()
Expand Down
43 changes: 1 addition & 42 deletions tests/slurm-tests/super_49b_evals/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,31 +142,10 @@ def eval_reasoning_on(workspace, cluster, expname_prefix, wandb_project):
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on",
)

# BFCL (Reasoning ON)
eval(
ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"),
cluster=cluster,
benchmarks="bfcl_v3",
model=base_model,
server_gpus=8,
num_jobs=1,
server_type="vllm",
output_dir=f"{workspace}/reasoning_on_tool_calling",
server_args=(
f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py "
f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024"
),
run_after=f"{expname_prefix}-download-models",
expname=f"{expname_prefix}-bfcl-on",
wandb_project=wandb_project,
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on",
)

return [
f"{expname_prefix}-math-code-science-on",
f"{expname_prefix}-livecode-on",
f"{expname_prefix}-hle-on",
f"{expname_prefix}-bfcl-on",
]


Expand Down Expand Up @@ -265,25 +244,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project):
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off",
)

# BFCL (Reasoning OFF)
eval(
ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"),
cluster=cluster,
benchmarks="bfcl_v3",
model=base_model,
server_gpus=8,
server_type="vllm",
output_dir=f"{workspace}/reasoning_off_tool_calling",
server_args=(
f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py "
f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024"
),
run_after=f"{expname_prefix}-download-models",
expname=f"{expname_prefix}-bfcl-off",
wandb_project=wandb_project,
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off",
)

# RULER (Reasoning OFF)
eval(
ctx=wrap_arguments(f"{common_params}"),
Expand All @@ -305,7 +265,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project):
f"{expname_prefix}-math-code-science-off",
f"{expname_prefix}-livecode-off",
f"{expname_prefix}-hle-off",
f"{expname_prefix}-bfcl-off",
f"{expname_prefix}-ruler-off",
]

Expand All @@ -320,7 +279,7 @@ def main():
args = parser.parse_args()

prepare_data(
ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode bfcl_v3 math-500 aime24 aime25"),
ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode math-500 aime24 aime25"),
)

setup(workspace=args.workspace, cluster=args.cluster, expname_prefix=args.expname_prefix)
Expand Down