diff --git a/nemo_skills/dataset/bfcl_v3/prepare.py b/nemo_skills/dataset/bfcl_v3/prepare.py index 1946249f46..358905bafd 100644 --- a/nemo_skills/dataset/bfcl_v3/prepare.py +++ b/nemo_skills/dataset/bfcl_v3/prepare.py @@ -48,7 +48,6 @@ def ensure_bfcl_eval_installed(): "pip", "install", "--no-cache-dir", - "-e", str(repo_dir / BFCL_EVAL_SUBDIR), "--extra-index-url", BFCL_EXTRA_INDEX_URL, diff --git a/nemo_skills/dataset/gsm-plus/prepare.py b/nemo_skills/dataset/gsm-plus/prepare.py index b94ee1c73f..5dfcecfe76 100644 --- a/nemo_skills/dataset/gsm-plus/prepare.py +++ b/nemo_skills/dataset/gsm-plus/prepare.py @@ -16,13 +16,11 @@ import json import os import pathlib -import urllib.request from pathlib import Path -from nemo_skills.dataset.utils import add_rounding_instruction - -URL = "https://huggingface.co/datasets/qintongli/GSM-Plus/resolve/main/data/test-00000-of-00001.jsonl?download=true" +from datasets import load_dataset +from nemo_skills.dataset.utils import add_rounding_instruction if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -47,10 +45,9 @@ split = "test" data_dir = Path(__file__).absolute().parent data_dir.mkdir(exist_ok=True) - original_file = str(data_dir / f"original_{split}.jsonl") output_file = str(data_dir / f"{split}.jsonl") - urllib.request.urlretrieve(URL, original_file) + dataset = load_dataset("qintongli/GSM-Plus", split="test") file_rounded = None if not args.no_rounding_instructions: @@ -62,10 +59,9 @@ for key in cleaning_options.keys(): cleaning_options[key] = set(cleaning_options[key]) - with open(original_file, "rt") as original, open(output_file, "w") as test_full: - original_data = [json.loads(line) for line in original.readlines()] - cleaning_options["none"] = set(range(len(original_data))) - for i, original_entry in enumerate(original_data): + cleaning_options["none"] = set(range(len(dataset))) + with open(output_file, "w") as test_full: + for i, original_entry in enumerate(dataset): if ( original_entry["perturbation_type"].replace(" ", "_") in args.categories and i in cleaning_options[args.cleaning] @@ -107,6 +103,3 @@ if file_rounded: file_rounded.close() - - # cleaning up - os.remove(original_file) diff --git a/nemo_skills/dataset/scicode/prepare.py b/nemo_skills/dataset/scicode/prepare.py index ba706fe64b..28c6f1fdb7 100644 --- a/nemo_skills/dataset/scicode/prepare.py +++ b/nemo_skills/dataset/scicode/prepare.py @@ -13,47 +13,23 @@ # limitations under the License. import json -import os -import urllib.request from pathlib import Path -URL = "https://huggingface.co/datasets/SciCode1/SciCode/raw/main/problems_{split}.jsonl" - +from datasets import load_dataset if __name__ == "__main__": data_dir = Path(__file__).absolute().parent - for split in ["dev", "test"]: - original_file = str(data_dir / f"original_{split}.jsonl") - data_dir.mkdir(exist_ok=True) - output_file = str(data_dir / f"{split}.jsonl") - - if not os.path.exists(original_file): - urllib.request.urlretrieve(URL.format(split=split), original_file) - - data = [] - with open(original_file, "rt", encoding="utf-8") as fin: - for line in fin: - entry = json.loads(line) - new_entry = entry # TODO? - data.append(new_entry) + data_dir.mkdir(exist_ok=True) - with open(output_file, "wt", encoding="utf-8") as fout: - for entry in data: - fout.write(json.dumps(entry) + "\n") + dataset = load_dataset("SciCode1/SciCode") - # Concate the two to make test_aai - dev_file = data_dir / "dev.jsonl" - test_file = data_dir / "test.jsonl" + split_mapping = {"validation": "dev", "test": "test"} test_aai_file = data_dir / "test_aai.jsonl" - - with open(dev_file, "rt", encoding="utf-8") as fin: - dev_data = [json.loads(line) for line in fin] - with open(test_file, "rt", encoding="utf-8") as fin: - test_data = [json.loads(line) for line in fin] - - test_aai_data = [] - test_aai_data.extend(dev_data) - test_aai_data.extend(test_data) - with open(test_aai_file, "w", encoding="utf-8") as fout: - for entry in test_aai_data: - fout.write(json.dumps(entry) + "\n") + with open(test_aai_file, "w", encoding="utf-8") as test_aai_fout: + for hf_split, output_split in split_mapping.items(): + output_file = data_dir / f"{output_split}.jsonl" + with open(output_file, "wt", encoding="utf-8") as fout: + for entry in dataset[hf_split]: + line = json.dumps(entry) + "\n" + fout.write(line) + test_aai_fout.write(line) diff --git a/tests/slurm-tests/clone_and_run.sh b/tests/slurm-tests/clone_and_run.sh index d151b6d325..fc4b55613e 100755 --- a/tests/slurm-tests/clone_and_run.sh +++ b/tests/slurm-tests/clone_and_run.sh @@ -13,8 +13,10 @@ git clone https://github.com/NVIDIA-NeMo/Skills.git NeMo-Skills cd NeMo-Skills curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=$LOCAL_WORKSPACE sh -$LOCAL_WORKSPACE/uv venv .venv --python 3.10 +$LOCAL_WORKSPACE/uv venv .venv --python 3.10 --seed source .venv/bin/activate +VENV_BIN="$(dirname "$(command -v python)")" +export PATH="$VENV_BIN:$PATH" $LOCAL_WORKSPACE/uv pip install -e . ./tests/slurm-tests/run_all.sh $1 diff --git a/tests/slurm-tests/qwen3_4b_evals/check_results.py b/tests/slurm-tests/qwen3_4b_evals/check_results.py index 2e8f12c4d7..3d939e5ddd 100644 --- a/tests/slurm-tests/qwen3_4b_evals/check_results.py +++ b/tests/slurm-tests/qwen3_4b_evals/check_results.py @@ -23,14 +23,14 @@ TOOLCALLING_METRIC_RANGES = { ("overall_accuracy", "accuracy"): (61.0, 67.0), - ("non_live_single_turn", "overall_non_live", "accuracy"): (84.0, 90.0), - ("non_live_single_turn", "non_live_ast", "accuracy"): (85.0, 92.0), - ("non_live_single_turn", "irrelevance", "accuracy"): (79.0, 86.0), - ("live_single_turn", "overall_live", "accuracy"): (76.0, 83.0), - ("live_single_turn", "live_ast", "accuracy"): (79.0, 86.0), - ("live_single_turn", "live_irrelevance", "accuracy"): (73.0, 80.0), - ("live_single_turn", "live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance - ("multi_turn", "overall_multi_turn", "accuracy"): (20.0, 30.0), + ("overall_non_live", "accuracy"): (84.0, 90.0), + ("non_live_ast", "accuracy"): (85.0, 92.0), + ("non_live_irrelevance", "accuracy"): (79.0, 86.0), + ("overall_live", "accuracy"): (76.0, 83.0), + ("live_ast", "accuracy"): (79.0, 86.0), + ("live_irrelevance", "accuracy"): (73.0, 80.0), + ("live_relevance", "accuracy"): (70.0, 90.0), # unusually high variance + ("overall_multi_turn", "accuracy"): (20.0, 30.0), } @@ -43,7 +43,7 @@ def check_results(eval_dir: str): f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json") - data = load_json(f) + data = load_json(f)["bfcl_v3"] for category_tuple, expected_range in TOOLCALLING_METRIC_RANGES.items(): val = float(get_nested_value(data, category_tuple)) lo, hi = expected_range diff --git a/tests/slurm-tests/qwen3_4b_evals/run_test.py b/tests/slurm-tests/qwen3_4b_evals/run_test.py index 7877a2a238..651defa04f 100644 --- a/tests/slurm-tests/qwen3_4b_evals/run_test.py +++ b/tests/slurm-tests/qwen3_4b_evals/run_test.py @@ -22,11 +22,12 @@ def eval_qwen3_bfcl(workspace, cluster, expname_prefix, wandb_project): eval( ctx=wrap_arguments( - f"++inference.temperature=0.6 " - f"++inference.top_p=0.95 " - f"++inference.tokens_to_generate=8192 " - f"++model_name={model} " - f"++parse_reasoning=True " + "++inference.temperature=0.6 " + "++inference.top_p=0.95 " + "++inference.tokens_to_generate=8192 " + # somehow Qwen3-4B was removed from default names, but 8b should be same parser + "++model_name=Qwen/Qwen3-8B-FC " + "++parse_reasoning=True " ), cluster=cluster, benchmarks="bfcl_v3", @@ -57,7 +58,6 @@ def eval_qwen3_online_genselect(workspace, cluster, expname_prefix, wandb_projec "++parallel_thinking.mode=genselect " "++server.enable_soft_fail=True " "++server.context_limit_retry_strategy=reduce_generation " - # "++skip_filled=False " ), cluster=cluster, benchmarks="aime24:1", diff --git a/tests/slurm-tests/super_49b_evals/check_results.py b/tests/slurm-tests/super_49b_evals/check_results.py index b749daf000..39cae2b8f7 100644 --- a/tests/slurm-tests/super_49b_evals/check_results.py +++ b/tests/slurm-tests/super_49b_evals/check_results.py @@ -19,7 +19,7 @@ from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent)) # for utils.py -from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402 +from utils import assert_all, load_json, soft_assert # noqa: E402 REASONING_TASKS = [ "math-500", @@ -67,7 +67,7 @@ "aime25": (0.0, 10.0), "gpqa": (49.0, 56.0), "mmlu-pro": (68.0, 71.0), - "livecodebench": (27.5, 32.5), + "livecodebench": (26.0, 32.5), "scicode": { "problem_accuracy": (5.0, 10.0), "subtask_accuracy": (20.0, 28.0), @@ -175,15 +175,6 @@ def check_reasoning(eval_dir: str, mode: str): soft_assert(lo <= val <= hi, f"{bench} ({mode}) {field}={val} out of range [{lo},{hi}]") -def check_toolcalling(eval_dir: str, mode: str): - f = os.path.join(eval_dir, "eval-results", "bfcl_v3", "metrics.json") - data = load_json(f) - for cat, path in TOOLCALLING_METRIC_PATHS.items(): - val = float(get_nested_value(data, path)) - lo, hi = TOOLCALLING_METRIC_RANGES[mode][cat] - soft_assert(lo <= val <= hi, f"TOOL-CALLING ({mode}) {cat}={val} out of range [{lo},{hi}]") - - def check_ruler(eval_dir: str, mode: str): f = os.path.join(eval_dir, "eval-results", "ruler.nemotron_super_128k_slurm_ci", "metrics.json") data = load_json(f) @@ -202,8 +193,6 @@ def main(): check_reasoning(eval_root / "reasoning_off", "reasoning_off") check_reasoning(eval_root / "reasoning_on", "reasoning_on") - check_toolcalling(eval_root / "reasoning_on_tool_calling", "reasoning_on") - check_toolcalling(eval_root / "reasoning_off_tool_calling", "reasoning_off") check_ruler(eval_root / "reasoning_off_ruler", "reasoning_off") assert_all() diff --git a/tests/slurm-tests/super_49b_evals/run_test.py b/tests/slurm-tests/super_49b_evals/run_test.py index d98000e733..dd11eeccde 100644 --- a/tests/slurm-tests/super_49b_evals/run_test.py +++ b/tests/slurm-tests/super_49b_evals/run_test.py @@ -142,31 +142,10 @@ def eval_reasoning_on(workspace, cluster, expname_prefix, wandb_project): wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on", ) - # BFCL (Reasoning ON) - eval( - ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"), - cluster=cluster, - benchmarks="bfcl_v3", - model=base_model, - server_gpus=8, - num_jobs=1, - server_type="vllm", - output_dir=f"{workspace}/reasoning_on_tool_calling", - server_args=( - f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py " - f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024" - ), - run_after=f"{expname_prefix}-download-models", - expname=f"{expname_prefix}-bfcl-on", - wandb_project=wandb_project, - wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-on", - ) - return [ f"{expname_prefix}-math-code-science-on", f"{expname_prefix}-livecode-on", f"{expname_prefix}-hle-on", - f"{expname_prefix}-bfcl-on", ] @@ -265,25 +244,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project): wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off", ) - # BFCL (Reasoning OFF) - eval( - ctx=wrap_arguments(f"{common_params} {tokens_to_generate} ++use_client_parsing=False"), - cluster=cluster, - benchmarks="bfcl_v3", - model=base_model, - server_gpus=8, - server_type="vllm", - output_dir=f"{workspace}/reasoning_off_tool_calling", - server_args=( - f"--tool-parser-plugin {base_model}/llama_nemotron_toolcall_parser_no_streaming.py " - f"--tool-call-parser llama_nemotron_json --enable-auto-tool-choice --max-num-seqs=1024" - ), - run_after=f"{expname_prefix}-download-models", - expname=f"{expname_prefix}-bfcl-off", - wandb_project=wandb_project, - wandb_name=f"{expname_prefix}-super_49b-eval-reasoning-off", - ) - # RULER (Reasoning OFF) eval( ctx=wrap_arguments(f"{common_params}"), @@ -305,7 +265,6 @@ def eval_reasoning_off(workspace, cluster, expname_prefix, wandb_project): f"{expname_prefix}-math-code-science-off", f"{expname_prefix}-livecode-off", f"{expname_prefix}-hle-off", - f"{expname_prefix}-bfcl-off", f"{expname_prefix}-ruler-off", ] @@ -320,7 +279,7 @@ def main(): args = parser.parse_args() prepare_data( - ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode bfcl_v3 math-500 aime24 aime25"), + ctx=wrap_arguments("gpqa mmlu-pro hle livecodebench scicode math-500 aime24 aime25"), ) setup(workspace=args.workspace, cluster=args.cluster, expname_prefix=args.expname_prefix)