Skip to content
1 change: 0 additions & 1 deletion nemo_skills/dataset/bfcl_v3/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def ensure_bfcl_eval_installed():
"pip",
"install",
"--no-cache-dir",
"-e",
str(repo_dir / BFCL_EVAL_SUBDIR),
"--extra-index-url",
BFCL_EXTRA_INDEX_URL,
Expand Down
19 changes: 6 additions & 13 deletions nemo_skills/dataset/gsm-plus/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@
import json
import os
import pathlib
import urllib.request
from pathlib import Path

from nemo_skills.dataset.utils import add_rounding_instruction

URL = "https://huggingface.co/datasets/qintongli/GSM-Plus/resolve/main/data/test-00000-of-00001.jsonl?download=true"
from datasets import load_dataset

from nemo_skills.dataset.utils import add_rounding_instruction

if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -47,10 +45,9 @@
split = "test"
data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)
original_file = str(data_dir / f"original_{split}.jsonl")
output_file = str(data_dir / f"{split}.jsonl")

urllib.request.urlretrieve(URL, original_file)
dataset = load_dataset("qintongli/GSM-Plus", split="test")

file_rounded = None
if not args.no_rounding_instructions:
Expand All @@ -62,10 +59,9 @@
for key in cleaning_options.keys():
cleaning_options[key] = set(cleaning_options[key])

with open(original_file, "rt") as original, open(output_file, "w") as test_full:
original_data = [json.loads(line) for line in original.readlines()]
cleaning_options["none"] = set(range(len(original_data)))
for i, original_entry in enumerate(original_data):
cleaning_options["none"] = set(range(len(dataset)))
with open(output_file, "w") as test_full:
for i, original_entry in enumerate(dataset):
if (
original_entry["perturbation_type"].replace(" ", "_") in args.categories
and i in cleaning_options[args.cleaning]
Expand Down Expand Up @@ -107,6 +103,3 @@

if file_rounded:
file_rounded.close()

# cleaning up
os.remove(original_file)
48 changes: 12 additions & 36 deletions nemo_skills/dataset/scicode/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,23 @@
# limitations under the License.

import json
import os
import urllib.request
from pathlib import Path

URL = "https://huggingface.co/datasets/SciCode1/SciCode/raw/main/problems_{split}.jsonl"

from datasets import load_dataset

if __name__ == "__main__":
data_dir = Path(__file__).absolute().parent
for split in ["dev", "test"]:
original_file = str(data_dir / f"original_{split}.jsonl")
data_dir.mkdir(exist_ok=True)
output_file = str(data_dir / f"{split}.jsonl")

if not os.path.exists(original_file):
urllib.request.urlretrieve(URL.format(split=split), original_file)

data = []
with open(original_file, "rt", encoding="utf-8") as fin:
for line in fin:
entry = json.loads(line)
new_entry = entry # TODO?
data.append(new_entry)
data_dir.mkdir(exist_ok=True)

with open(output_file, "wt", encoding="utf-8") as fout:
for entry in data:
fout.write(json.dumps(entry) + "\n")
dataset = load_dataset("SciCode1/SciCode")

    # Concatenate the two splits to make test_aai
dev_file = data_dir / "dev.jsonl"
test_file = data_dir / "test.jsonl"
split_mapping = {"validation": "dev", "test": "test"}
test_aai_file = data_dir / "test_aai.jsonl"

with open(dev_file, "rt", encoding="utf-8") as fin:
dev_data = [json.loads(line) for line in fin]
with open(test_file, "rt", encoding="utf-8") as fin:
test_data = [json.loads(line) for line in fin]

test_aai_data = []
test_aai_data.extend(dev_data)
test_aai_data.extend(test_data)
with open(test_aai_file, "w", encoding="utf-8") as fout:
for entry in test_aai_data:
fout.write(json.dumps(entry) + "\n")
with open(test_aai_file, "w", encoding="utf-8") as test_aai_fout:
for hf_split, output_split in split_mapping.items():
output_file = data_dir / f"{output_split}.jsonl"
with open(output_file, "wt", encoding="utf-8") as fout:
for entry in dataset[hf_split]:
line = json.dumps(entry) + "\n"
fout.write(line)
test_aai_fout.write(line)
Comment on lines +28 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File test_aai.jsonl is opened in the outer scope but written to inside the nested loop for each split. Because the outer with block keeps the file open for the entire loop, dev.jsonl and test.jsonl entries are written to it sequentially through two separate file handles (fout and test_aai_fout). While this works, it is cleaner to write test_aai.jsonl only after the per-split files are complete.

Additionally, per CONTRIBUTING.md guidelines (line 40-42), avoid data loss by completing computation before writing. Consider writing all files separately first, then concatenating to avoid partial writes if there's a failure.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

4 changes: 3 additions & 1 deletion tests/slurm-tests/clone_and_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ git clone https://github.com/NVIDIA-NeMo/Skills.git NeMo-Skills
cd NeMo-Skills

curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=$LOCAL_WORKSPACE sh
$LOCAL_WORKSPACE/uv venv .venv --python 3.10
$LOCAL_WORKSPACE/uv venv .venv --python 3.10 --seed
source .venv/bin/activate
VENV_BIN="$(dirname "$(command -v python)")"
export PATH="$VENV_BIN:$PATH"
$LOCAL_WORKSPACE/uv pip install -e .

./tests/slurm-tests/run_all.sh $1
18 changes: 9 additions & 9 deletions tests/slurm-tests/qwen3_4b_evals/check_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@

TOOLCALLING_METRIC_RANGES = {
("overall_accuracy", "accuracy"): (61.0, 67.0),
("non_live_single_turn", "overall_non_live", "accuracy"): (84.0, 90.0),
("non_live_single_turn", "non_live_ast", "accuracy"): (85.0, 92.0),
("non_live_single_turn", "irrelevance", "accuracy"): (79.0, 86.0),
("live_single_turn", "overall_live", "accuracy"): (76.0, 83.0),
("live_single_turn", "live_ast", "accuracy"): (79.0, 86.0),
("live_single_turn", "live_irrelevance", "accuracy"): (73.0, 80.0),
("live_single_turn", "live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance
("multi_turn", "overall_multi_turn", "accuracy"): (20.0, 30.0),
("overall_non_live", "accuracy"): (84.0, 90.0),
("non_live_ast", "accuracy"): (85.0, 92.0),
("non_live_irrelevance", "accuracy"): (79.0, 86.0),
("overall_live", "accuracy"): (76.0, 83.0),
("live_ast", "accuracy"): (79.0, 86.0),
("live_irrelevance", "accuracy"): (73.0, 80.0),
("live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance
("overall_multi_turn", "accuracy"): (20.0, 30.0),
}


Expand All @@ -43,7 +43,7 @@

def check_results(eval_dir: str):
f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json")
data = load_json(f)
data = load_json(f)["bfcl_v3"]
for category_tuple, expected_range in TOOLCALLING_METRIC_RANGES.items():
val = float(get_nested_value(data, category_tuple))
lo, hi = expected_range
Expand Down
12 changes: 6 additions & 6 deletions tests/slurm-tests/qwen3_4b_evals/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ def eval_qwen3_bfcl(workspace, cluster, expname_prefix, wandb_project):

eval(
ctx=wrap_arguments(
f"++inference.temperature=0.6 "
f"++inference.top_p=0.95 "
f"++inference.tokens_to_generate=8192 "
f"++model_name={model} "
f"++parse_reasoning=True "
"++inference.temperature=0.6 "
"++inference.top_p=0.95 "
"++inference.tokens_to_generate=8192 "
# somehow Qwen3-4B was removed from default names, but 8b should be same parser
"++model_name=Qwen/Qwen3-8B-FC "
"++parse_reasoning=True "
),
cluster=cluster,
benchmarks="bfcl_v3",
Expand Down Expand Up @@ -57,7 +58,6 @@ def eval_qwen3_online_genselect(workspace, cluster, expname_prefix, wandb_projec
"++parallel_thinking.mode=genselect "
"++server.enable_soft_fail=True "
"++server.context_limit_retry_strategy=reduce_generation "
# "++skip_filled=False "
),
cluster=cluster,
benchmarks="aime24:1",
Expand Down
15 changes: 2 additions & 13 deletions tests/slurm-tests/super_49b_evals/check_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent)) # for utils.py
from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402
from utils import assert_all, load_json, soft_assert # noqa: E402

REASONING_TASKS = [
"math-500",
Expand Down Expand Up @@ -67,7 +67,7 @@
"aime25": (0.0, 10.0),
"gpqa": (49.0, 56.0),
"mmlu-pro": (68.0, 71.0),
"livecodebench": (27.5, 32.5),
"livecodebench": (26.0, 32.5),
"scicode": {
"problem_accuracy": (5.0, 10.0),
"subtask_accuracy": (20.0, 28.0),
Expand Down Expand Up @@ -175,15 +175,6 @@ def check_reasoning(eval_dir: str, mode: str):
soft_assert(lo <= val <= hi, f"{bench} ({mode}) {field}={val} out of range [{lo},{hi}]")


def check_toolcalling(eval_dir: str, mode: str):
f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json")
data = load_json(f)
for cat, path in TOOLCALLING_METRIC_PATHS.items():
val = float(get_nested_value(data, path))
lo, hi = TOOLCALLING_METRIC_RANGES[mode][cat]
soft_assert(lo <= val <= hi, f"TOOL-CALLING ({mode}) {cat}={val} out of range [{lo},{hi}]")


def check_ruler(eval_dir: str, mode: str):
f = os.path.join(eval_dir, "eval-results", "ruler.nemotron_super_128k_slurm_ci", "metrics.json")
data = load_json(f)
Expand All @@ -202,8 +193,6 @@ def main():

check_reasoning(eval_root / "reasoning_off", "reasoning_off")
check_reasoning(eval_root / "reasoning_on", "reasoning_on")
check_toolcalling(eval_root / "reasoning_on_tool_calling", "reasoning_on")
check_toolcalling(eval_root / "reasoning_off_tool_calling", "reasoning_off")
check_ruler(eval_root / "reasoning_off_ruler", "reasoning_off")

assert_all()
Expand Down
43 changes: 1 addition & 42 deletions tests/slurm-tests/super_49b_evals/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,31 +142,10 @@ def eval_reasoning_on(workspace, cluster, expname_prefix, wandb_project):
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on",
)

# BFCL (Reasoning ON)
eval(
ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"),
cluster=cluster,
benchmarks="bfcl_v3",
model=base_model,
server_gpus=8,
num_jobs=1,
server_type="vllm",
output_dir=f"{workspace}/reasoning_on_tool_calling",
server_args=(
f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py "
f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024"
),
run_after=f"{expname_prefix}-download-models",
expname=f"{expname_prefix}-bfcl-on",
wandb_project=wandb_project,
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on",
)

return [
f"{expname_prefix}-math-code-science-on",
f"{expname_prefix}-livecode-on",
f"{expname_prefix}-hle-on",
f"{expname_prefix}-bfcl-on",
]


Expand Down Expand Up @@ -265,25 +244,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project):
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off",
)

# BFCL (Reasoning OFF)
eval(
ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"),
cluster=cluster,
benchmarks="bfcl_v3",
model=base_model,
server_gpus=8,
server_type="vllm",
output_dir=f"{workspace}/reasoning_off_tool_calling",
server_args=(
f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py "
f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024"
),
run_after=f"{expname_prefix}-download-models",
expname=f"{expname_prefix}-bfcl-off",
wandb_project=wandb_project,
wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off",
)

# RULER (Reasoning OFF)
eval(
ctx=wrap_arguments(f"{common_params}"),
Expand All @@ -305,7 +265,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project):
f"{expname_prefix}-math-code-science-off",
f"{expname_prefix}-livecode-off",
f"{expname_prefix}-hle-off",
f"{expname_prefix}-bfcl-off",
f"{expname_prefix}-ruler-off",
]

Expand All @@ -320,7 +279,7 @@ def main():
args = parser.parse_args()

prepare_data(
ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode bfcl_v3 math-500 aime24 aime25"),
ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode math-500 aime24 aime25"),
)

setup(workspace=args.workspace, cluster=args.cluster, expname_prefix=args.expname_prefix)
Expand Down