Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
848 commits
Select commit Hold shift + click to select a range
9dc70bd
Fix timeout bug for LEAN4 code execution (#647)
fchen97 Aug 7, 2025
27f5e44
rename folder, add datasets (#651)
ekmb Aug 7, 2025
62d0845
Improve client (#652)
smahdavi4 Aug 7, 2025
3806646
Fixes for code execution (#656)
Kipok Aug 8, 2025
446ed09
Add skip_special_tokens=False for completions (#657)
Kipok Aug 8, 2025
33a8baa
Fix timeout raise in sandbox
Kipok Aug 9, 2025
4cfd725
Fix timeout for code exec
Kipok Aug 9, 2025
5cd44ab
Update annotation
Kipok Aug 9, 2025
8755ab5
Update timeout to 4 hours
Kipok Aug 9, 2025
4e931e6
Online GenSelect (#655)
shtoshni Aug 11, 2025
ed68025
Adding long context benchmark MRCR (#634)
fayejf Aug 11, 2025
45ac117
Fix a small bug in generation with chunks (#661)
fchen97 Aug 12, 2025
1bd199d
Small fix for mrcr prepare.py (#662)
fayejf Aug 12, 2025
58af382
Fix base checkpoints in the docs
Kipok Aug 12, 2025
6a12754
Fix formatting
Kipok Aug 12, 2025
c9d361d
Fix type mismatch for max code executions (#665)
Kipok Aug 13, 2025
642b92a
Allow generation type or custom module in `eval` pipeline (#666)
activatedgeek Aug 13, 2025
f200cac
update grpo with megatron backend (#653)
wedu-nvidia Aug 13, 2025
34b90a5
bugfix: missing generation module arg in eval pipeline cmd script (#668)
activatedgeek Aug 13, 2025
eab2d58
add support for nsys profile (#667)
wedu-nvidia Aug 14, 2025
8c27f28
Fixing BFCL (#669)
shtoshni Aug 14, 2025
eba855e
Minor fixes to dataset defaults (#672)
shtoshni Aug 14, 2025
be737a0
Enable system_message for openai prompt format (#670)
Kipok Aug 14, 2025
e88c084
Reproducing Llama Nemotron Results with NeMo-Skills (#676)
shtoshni Aug 15, 2025
e600c78
Fixes for docs
Kipok Aug 15, 2025
3022a9b
Add SWE-bench inference & evaluation (#671)
ludwig-n Aug 15, 2025
5b00e48
Remove prompt template (#673)
Kipok Aug 16, 2025
86ee4e4
allow overlapping sandbox with run_cmd (#680)
SeanNaren Aug 16, 2025
ff19286
Majority + Pass@k fix (#679)
shtoshni Aug 18, 2025
f075214
Remove broken benchmark
Kipok Aug 18, 2025
f1fb1b3
Pin openai version to v1.99.9 (#684)
ludwig-n Aug 18, 2025
9a47605
Allow customizing SWE-agent/OpenHands repo & commit for SWE-bench (#682)
ludwig-n Aug 18, 2025
5df438b
Fix 'Argument list too long' error in get_remaining_jobs (#674)
tamohannes Aug 18, 2025
7dace78
fixing conflicts
wasiahmad Aug 18, 2025
b178cc9
fixing conflicts
wasiahmad Aug 18, 2025
e020abd
fix data cache bug (#685)
wedu-nvidia Aug 18, 2025
71e4fa3
Fixes for ruler, simplified recipe and some docs (#686)
Kipok Aug 18, 2025
397943f
Update prepare.py (#687)
fayejf Aug 18, 2025
4cd131d
lcb updates to work with pypy3
wasiahmad Aug 19, 2025
a78f0e7
lcb updates to work with pypy3
wasiahmad Aug 19, 2025
e2a687a
Patch for LCB score calculation fix (#688)
wasiahmad Aug 19, 2025
cc69fd4
Fix cache variable (#692)
smahdavi4 Aug 19, 2025
0dab4a8
Add docs for ruler repro in tutorial (#693)
Kipok Aug 20, 2025
47f7fe7
New evaluation docs (#694)
Kipok Aug 20, 2025
c672325
Add support for api_key_env_var (#696)
Glorf Aug 21, 2025
722422b
fix: scicode missing functions (#699)
jubick1337 Aug 21, 2025
2e21e9b
BFCL Multi turn fixes (#704)
shtoshni Aug 21, 2025
b5c289a
Scicode fixed numbers for Llama Nemotron (#705)
shtoshni Aug 21, 2025
1d8b332
Fix links (#706)
Kipok Aug 21, 2025
85a66ce
Update CONTRIBUTING.md (#697)
activatedgeek Aug 21, 2025
096f52a
gpt-oss integration (#711)
Kipok Aug 22, 2025
db8f2e5
Error handing for long context stuff (#708)
shtoshni Aug 22, 2025
2fa7da7
Support gemini api (#698)
hsiehjackson Aug 22, 2025
7685b2f
add beyondaime (#707)
wedu-nvidia Aug 22, 2025
e6f6a89
Update lcb splits (#714)
shtoshni Aug 22, 2025
fcaf706
handle exp name too long (#716)
fayejf Aug 22, 2025
6eecc91
Add IOI (#701)
SeanNaren Aug 22, 2025
fc1c13f
Nano v2 tutorial (#718)
shtoshni Aug 23, 2025
4b9b2d0
Fix link in readme
Kipok Aug 23, 2025
b26df04
Removing conversion tests (#720)
shtoshni Aug 25, 2025
4bcdcf9
Fix shell execution interface (#719)
Kipok Aug 25, 2025
6c1166e
BFCL v3 data prep - Repo version pinning (#725)
shtoshni Aug 26, 2025
34d961b
add hf_home check (#702)
wedu-nvidia Aug 26, 2025
095f1aa
Ipython session affinity (#622)
gwarmstrong Aug 26, 2025
afd373d
Add SFT data translation module (#721)
shuoyangd Aug 26, 2025
1d3fd35
Fix streaming in async code execution (#722)
Kipok Aug 26, 2025
120af1e
Add SWE-bench docs (#726)
ludwig-n Aug 27, 2025
4cf1f48
fix model path in docs/basics aime eval example (#728)
stephencge Aug 27, 2025
d921a1a
Adding MCP clients (#713)
gwarmstrong Aug 27, 2025
c0b5022
Cleaning up references to TRTLLM model (#731)
shtoshni Aug 27, 2025
53f83c7
Small fix for web search (#732)
smahdavi4 Aug 27, 2025
07f1163
Fix judge for non-default benchmarks (#727)
Kipok Aug 27, 2025
75e84d1
Fix SciCode evaluation (#730)
jubick1337 Aug 27, 2025
31bcbfd
Fix apptainer installation in docker (#734)
Kipok Aug 27, 2025
bd998ff
Defer server wait for generation (#735)
smahdavi4 Aug 28, 2025
a5f3bcc
HLE Fix (#733)
shtoshni Aug 28, 2025
d922da3
Update evaluation numbers and a default split for scicode (#739)
Kipok Aug 28, 2025
173f811
Add quotes
Kipok Aug 28, 2025
1899ad1
Tool support based on Chat Completion APIs (#717)
activatedgeek Aug 28, 2025
07ad14c
Fix ruff config (#740)
activatedgeek Aug 28, 2025
06cdacb
Remove pin from openai req (#742)
Kipok Aug 28, 2025
0367809
Parallel evaluation on a single node (#743)
Kipok Aug 28, 2025
e256ffd
Small fix to parallel mode
Kipok Aug 28, 2025
3ac390f
Scicode updates to plot (#744)
shtoshni Aug 28, 2025
85baa02
Soft failure (#723)
shtoshni Aug 29, 2025
2fecec3
feat: unit test on slurm (#675)
wedu-nvidia Aug 29, 2025
c8c6382
AIMO inference tutorial (#745)
darraghdog Aug 29, 2025
02b6c78
Bug Fix w/ Translation Module (#746)
shuoyangd Aug 29, 2025
a740b54
Updates to file formatting (#747)
shtoshni Aug 29, 2025
69ff32b
Precommit fixes more complex (#748)
shtoshni Aug 29, 2025
6f1a687
Add a tutorial on running gpt-oss with python tool (#750)
Kipok Aug 29, 2025
9e7774c
Fix missing import
Kipok Aug 30, 2025
f902d45
BFCL Docs (#753)
shtoshni Aug 30, 2025
d2ee015
Slurm tests enhancements (#754)
Kipok Aug 30, 2025
391652d
Remove wandb_project
Kipok Aug 30, 2025
38de272
Fixes for slurm tests (#755)
Kipok Aug 30, 2025
2c5f31d
More fixes for slurm tests and nemo-rl sft (#756)
Kipok Sep 1, 2025
b092a9c
update nemo-rl to latest main (#752)
wedu-nvidia Sep 1, 2025
984e00c
update nemo-rl to latest main (#752)
wedu-nvidia Sep 1, 2025
36f9c91
Server container now can be passed with CLI (#758)
vmendelev Sep 3, 2025
7eed4be
BFCL v3 Testing + Refactoring (#761)
shtoshni Sep 3, 2025
b80d807
Add MMLU-Pro-X (#751)
shuoyangd Sep 3, 2025
6a3a0b0
Prompt examples added + Minor changes to prompt construction (#764)
shtoshni Sep 3, 2025
383b428
Generate docs with context length error part added (#765)
shtoshni Sep 3, 2025
61ac015
Natural Language math docs (#767)
avem-nv Sep 4, 2025
e2f23a1
Reduce default `max_concurrent_requests` back to 512 (#770)
shuoyangd Sep 4, 2025
0fee3e4
disable validation for nemo-rl if no validation data is provided (#766)
wedu-nvidia Sep 4, 2025
27e4800
Adjust slurm parameters (#771)
Kipok Sep 4, 2025
367ae37
Update to slurm tests setup
Kipok Sep 4, 2025
c6ad546
Update instruction for cron
Kipok Sep 4, 2025
51ee492
Update test constraints
Kipok Sep 4, 2025
2c372b6
Small doc fix
Kipok Sep 4, 2025
d463ba3
MCP interface updates (#772)
gwarmstrong Sep 4, 2025
5e94850
Fix to not use completions api when soft_fail=True (#774)
Kipok Sep 4, 2025
3be5070
Fix MCP env propagation (#775)
gwarmstrong Sep 4, 2025
af217f6
Fixing docs (#776)
shtoshni Sep 4, 2025
0c26b5a
Update slurm tests (#777)
Kipok Sep 5, 2025
aaffceb
Update constraints
Kipok Sep 5, 2025
1a27645
Fix async tool registration (#781)
gwarmstrong Sep 5, 2025
988130c
Fix OpenHands patch issue & add more info to SWE-bench docs (#778)
ludwig-n Sep 5, 2025
f59b6c0
Potential solution to logging duplication (#782)
shtoshni Sep 5, 2025
fccb153
TRTLLM + Soft Fail (#786)
shtoshni Sep 5, 2025
60430e5
Add litellm cache (#789)
smahdavi4 Sep 7, 2025
75fd5b8
GenSelect Online (#783)
shtoshni Sep 7, 2025
52d544a
Add standard deviation metrics for benchmark variance analysis (#757)
AdamRajfer Sep 8, 2025
3413dc6
GenSelect -> GenEvolution (#791)
shtoshni Sep 8, 2025
a9f31e5
Slurm tests refactoring (#795)
Kipok Sep 8, 2025
0364637
GenEvolution docs (#792)
shtoshni Sep 8, 2025
fda85a3
Fix vllm multi-node + add conversion to int for gpus (#796)
Kipok Sep 9, 2025
d9dc1b9
Gpt-oss-python slurm test + more small refactoring (#797)
Kipok Sep 9, 2025
2729707
gen select/synth default_factory (#800)
stephencge Sep 9, 2025
1aca76e
Update constraints
Kipok Sep 9, 2025
26f591d
Fix SWE-bench parallel runs and other issues (#794)
ludwig-n Sep 9, 2025
db07c74
updated config for sft data prep (#785)
wasiahmad Sep 9, 2025
cb4c0be
lean eval last code block and has sorry logic fix (#801)
stephencge Sep 10, 2025
a02d527
update max_tokens to max_completion_tokens for openai api (#804)
jiacheng-xu Sep 10, 2025
e6c27aa
Add build stage for docker images (#805)
gwarmstrong Sep 11, 2025
5730a9e
FIX Cleanup Sessions on timeout (#803)
gwarmstrong Sep 11, 2025
47740e8
Add search to docs (#807)
darraghdog Sep 11, 2025
6780b53
enable msg format data passing for sft (#806)
wasiahmad Sep 11, 2025
e03d046
fix hf_model as None bug (#808)
wedu-nvidia Sep 12, 2025
f5614ad
add lr scheduler for nemo-rl sft with fsdp as backend (#759)
wedu-nvidia Sep 13, 2025
a9b3ca5
GenSynthesis prompt updates (#809)
shtoshni Sep 14, 2025
4d3f817
Make OpenHands use uploaded dataset instead of redownloading from HF …
ludwig-n Sep 15, 2025
ed25dda
Update constraints
Kipok Sep 15, 2025
c74bdeb
Unifying context length error handling (#812)
shtoshni Sep 16, 2025
1f45d0c
add MathOlympiadBench (#814)
stephencge Sep 16, 2025
174d271
update miniF2F dataset (#813)
stephencge Sep 17, 2025
58f3d55
fix typo (#816)
wedu-nvidia Sep 18, 2025
678cced
Fix tool calling (#815)
Kipok Sep 18, 2025
a2a68ed
Update filters.trim_solutions=false to be default (#817)
Kipok Sep 18, 2025
6decc47
Fix typo in grpo and tests (#818)
Kipok Sep 18, 2025
e2bd5dd
Evaluation on BigCodeBench (#547)
wasiahmad Sep 18, 2025
f8897ac
A small bug fix (#819)
wasiahmad Sep 18, 2025
c6a656f
Disallow empty prepare_data (#820)
Kipok Sep 18, 2025
75c1148
Move sequence parallel to top level policy (#823)
Kipok Sep 18, 2025
00ebe52
Evaluation on LiveBench-Coding (#821)
wasiahmad Sep 18, 2025
e0a26f7
Expose timeout parameter for individual calls (#827)
Kipok Sep 19, 2025
6492a70
Implement token std statistics (#826)
AdamRajfer Sep 19, 2025
478765d
Add Long context benchmark AA-LCR (#798)
fayejf Sep 19, 2025
8160c6f
fixing merge conflicts
wasiahmad Sep 19, 2025
906246d
fixing merge conflicts
wasiahmad Sep 19, 2025
bb21b6b
code logic reorganized
wasiahmad Sep 19, 2025
b15b3a0
code logic reorganized
wasiahmad Sep 19, 2025
3a8b9ad
lcb eval harness main branch need to be used with pypy3
wasiahmad Sep 19, 2025
2d8a042
lcb eval harness main branch need to be used with pypy3
wasiahmad Sep 19, 2025
df93e20
Update default sandbox parameters (#830)
Kipok Sep 20, 2025
3163ea8
update to latest commit (#831)
wedu-nvidia Sep 20, 2025
c2ba406
Bump nemo-rl version to 0.7.1 (#832)
Kipok Sep 21, 2025
68d6144
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 22, 2025
ebd5bab
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 22, 2025
2da40f1
Add aa lcr to aai (#836)
fayejf Sep 22, 2025
69b501f
Add new benchmark SimpleQA to nemo_skills (#828)
jiacheng-xu Sep 22, 2025
2afdbfe
hle with detail splits (#837)
jiacheng-xu Sep 23, 2025
9b7ce0c
timeout should be int
wasiahmad Sep 23, 2025
66dbaa6
timeout should be int
wasiahmad Sep 23, 2025
6b7af16
separating lcb code eval into a different file
wasiahmad Sep 23, 2025
93ec8b4
separating lcb code eval into a different file
wasiahmad Sep 23, 2025
e1842df
fixing minor issue
wasiahmad Sep 23, 2025
a680edc
fixing minor issue
wasiahmad Sep 23, 2025
b3d4612
fixing minor issue
wasiahmad Sep 23, 2025
4153e04
fixing minor issue
wasiahmad Sep 23, 2025
d64fa43
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 23, 2025
abb4805
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 23, 2025
e6d6b02
minor updates
wasiahmad Sep 23, 2025
6af340e
minor updates
wasiahmad Sep 23, 2025
1abbb9c
Asynchronous eval in Generation Loop (#825)
gwarmstrong Sep 23, 2025
2d294ff
further optimizations
wasiahmad Sep 24, 2025
d2388c4
further optimizations
wasiahmad Sep 24, 2025
6e68efa
NeMo-RL SFT sample printing (to verify if template is applied) (#833)
wasiahmad Sep 25, 2025
5196289
Update default MCQ prompts (GPQA, MMLU-Pro) to non-boxed format (#843)
ekmb Sep 25, 2025
b25de59
Only run env var check for identity file when key available (#850)
activatedgeek Sep 26, 2025
ba480a6
minor issue fix
wasiahmad Sep 26, 2025
20888db
minor issue fix
wasiahmad Sep 26, 2025
a9c0bf2
minor issue fix
wasiahmad Sep 26, 2025
b887bc7
minor issue fix
wasiahmad Sep 26, 2025
1f2985c
fixing indent issue
wasiahmad Sep 26, 2025
976e6da
fixing indent issue
wasiahmad Sep 26, 2025
0fb1b8e
fixing file issues
wasiahmad Sep 26, 2025
c10e26a
fixing file issues
wasiahmad Sep 26, 2025
bf6231a
Sandbox history restoration fix (#838)
i-vainn Sep 26, 2025
1d64b6c
changing lcb eval harness url
wasiahmad Sep 26, 2025
27f886b
changing lcb eval harness url
wasiahmad Sep 26, 2025
268d971
Merge remote-tracking branch 'origin/main' into feat/lcb_eval
wasiahmad Sep 26, 2025
31db2de
Merge remote-tracking branch 'origin/main' into feat/lcb_eval
wasiahmad Sep 26, 2025
7798899
pypy3 testing with datasets
wasiahmad Sep 26, 2025
0c73f0e
pypy3 testing with datasets
wasiahmad Sep 26, 2025
694c107
keeping test cases
wasiahmad Sep 26, 2025
aa6c209
keeping test cases
wasiahmad Sep 26, 2025
3c3a73d
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
3df9308
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
c759a74
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
758333b
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
2829c20
debugging
wasiahmad Sep 26, 2025
07a13cb
debugging
wasiahmad Sep 26, 2025
c3f6a23
fix data prep issues
wasiahmad Sep 26, 2025
a6eab74
fix data prep issues
wasiahmad Sep 26, 2025
193c940
dataset preparation updated
wasiahmad Sep 26, 2025
47798dc
dataset preparation updated
wasiahmad Sep 26, 2025
8597857
changing lcb eval harness branch name
wasiahmad Sep 27, 2025
b096d9b
changing lcb eval harness branch name
wasiahmad Sep 27, 2025
9e2e0bd
Slurm: fix time format, and allow default timeout (#853)
artbataev Sep 28, 2025
7c61cf1
Prompt sensitivity (multiprompt eval) support (#847)
gnalbandyan Sep 29, 2025
59430b6
Fix HF_TOKEN assignment. Fix env vars priority: config -> environment…
artbataev Sep 29, 2025
8fa3fc2
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 29, 2025
1ed6bb1
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 29, 2025
5e9eaf7
Megatron backend changes: minor fix, add random ports (#862)
lizziew Sep 29, 2025
b849bfb
fix wandb (#859)
wedu-nvidia Sep 29, 2025
9ee8ea4
Allow setting random seeds for benchmark groups (#860)
Kipok Sep 30, 2025
1a59903
Generation time + Input Sequence Length (#865)
shtoshni Sep 30, 2025
d2c5863
Small for for isl calc (#868)
Kipok Sep 30, 2025
680d00b
Proper fix for isl (#869)
Kipok Sep 30, 2025
2650a75
Adding support for arm64 containers (#856)
Kipok Sep 30, 2025
a5bede1
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 30, 2025
0751a89
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 30, 2025
dd27f01
adding LCB docs
wasiahmad Sep 30, 2025
0ccde9f
adding LCB docs
wasiahmad Sep 30, 2025
cc875fd
revert nemo-rl patch (#871)
activatedgeek Sep 30, 2025
bdf6b37
Remove sharding docs (#872)
smahdavi4 Sep 30, 2025
65e99b2
Adding support for training with megatron-lm (#873)
Kipok Oct 1, 2025
1050ecf
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
60087ac
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
ca09081
Evaluation on OJBench (#848)
wasiahmad Oct 1, 2025
9028635
fixing merge conflicts
wasiahmad Oct 1, 2025
4970d6e
fixing merge conflicts
wasiahmad Oct 1, 2025
4042286
resolving conflicts
wasiahmad Oct 1, 2025
27fde4f
merging
wasiahmad Oct 1, 2025
3729e9a
updating docs
wasiahmad Oct 1, 2025
29dad0f
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
d050d9f
minor doc update
wasiahmad Oct 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added __init__.py
Empty file.
82 changes: 82 additions & 0 deletions docs/evaluation/code.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,88 @@ all you need to do is replace `openhands` with `swe_agent` in the command above.
- Benchmark is defined in [`nemo_skills/dataset/livecodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench/__init__.py)
- Original benchmark source is [here](https://github.com/LiveCodeBench/LiveCodeBench).

#### Data Preparation

First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test_v6_2408_2505.jsonl`.

```
ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05
```

##### For Pypy3 Evaluation:
If you plan to evaluate using the PyPy3 interpreter, you must add the `--keep_all_columns` flag during data preparation. This downloads a larger dataset (~1.9 GB) that contains the necessary test cases, so we recommend downloading it directly to a Slurm cluster location.

```
ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 --keep_all_columns --cluster=<CLUSTER_NAME> --data_dir=<DATA_DIR>
```

#### Running the Evaluation

Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths.

##### Standard Python Evaluation

This command runs an evaluation of [OpenReasoning-Nemotron-32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B) on a Slurm cluster.

```
ns eval \
--cluster=<CLUSTER_NAME> \
--model=nvidia/OpenReasoning-Nemotron-32B \
--server_type=vllm \
--server_args="--async-scheduling" \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=livecodebench \
--split=test_v6_2408_2505 \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
--extra_eval_args="++eval_config.interpreter=python" \
--with_sandbox \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=65536
```

##### Pypy3 Evaluation

To run with the Pypy3 interpreter, modify the `--extra_eval_args` flag as shown below.
```bash
--extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl"
```
Comment on lines +227 to +228
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Tag the interpreter override snippet as bash.

Prevents MD040 lint failures.

-```
+```bash
 --extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl"
-```
+```
🤖 Prompt for AI Agents
In docs/evaluation/code.md around lines 227-228, the code fence containing the
--extra_eval_args snippet is not tagged with a language which triggers MD040;
update the opening fence to ```bash so the snippet is explicitly labeled as bash
and ensure the closing ``` fence remains present and correctly placed.


##### Verifying Results

After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/livecodebench/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/livecodebench/summarized-results/main_*`. They should look something like this:

```text
-------------------------- livecodebench --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 454 | 15995 | 2188 | 71.15%


------------------------ livecodebench-easy -----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 110 | 5338 | 1806 | 99.09%


------------------------ livecodebench-hard -----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 203 | 23031 | 2188 | 46.31%


----------------------- livecodebench-medium ----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 141 | 14178 | 1889 | 85.11%
```
Comment on lines +235 to +253
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Declare the metrics block as plain text.

The ASCII table isn’t JSON; marking it as text satisfies MD040 and keeps formatting intact.

-```
+```text
 -------------------------- livecodebench --------------------------
 evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
@@
 pass@1          | 141         | 14178      | 1889        | 85.11%
-```
+```
🤖 Prompt for AI Agents
In docs/evaluation/code.md around lines 235 to 253, the ASCII metrics table is
currently in a fenced code block without a language, triggering MD040; change
the fence to declare the block as plain text by adding "text" after the opening
triple backticks (i.e., use ```text) and keep the closing triple backticks
unchanged so the table renders as plain text and preserves formatting.


##### Advanced: Averaging Multiple Runs

Due to variance between runs, you can automatically repeat the evaluation and average the results. To run the evaluation 3 times, for example, set the `--benchmarks` flag as follows:

```
--benchmarks=livecodebench:3
```

### livecodebench-pro

- Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py)
Expand Down
54 changes: 26 additions & 28 deletions nemo_skills/dataset/livecodebench/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from datetime import datetime
from pathlib import Path

from datasets import load_dataset
from datasets import Value, load_dataset
from dateutil.relativedelta import relativedelta


Expand Down Expand Up @@ -66,7 +66,7 @@ def parse_month_range(start_date, end_date):
raise ValueError(str(e))


def clean_data(dataset):
def clean_data(dataset, keep_all_columns=False):
def map_fn(data):
question = data["question_content"] + "\n\n"
if data["starter_code"]:
Expand All @@ -80,22 +80,26 @@ def map_fn(data):
data["question"] = question.replace(" ", "\t")
return data

remove_columns = [
"question_title",
"contest_id",
"public_test_cases",
"private_test_cases",
"metadata",
"question_content",
"platform",
"question_id",
"starter_code",
]
remove_columns = []
if not keep_all_columns:
remove_columns = [
"question_title",
"contest_id",
"metadata",
"question_content",
"platform",
"question_id",
"starter_code",
"public_test_cases",
"private_test_cases",
]
dataset = dataset.cast_column("public_test_cases", Value("large_string"))
dataset = dataset.cast_column("private_test_cases", Value("large_string"))
dataset = dataset.map(map_fn, remove_columns=remove_columns)
return dataset


def prepare(start_date, end_date, release_version, output_dir):
def prepare(start_date, end_date, release_version, output_dir, keep_all_columns=False):
start_date, end_date = parse_month_range(start_date, end_date)
start_yymm = start_date.strftime("%y%m")
end_yymm = end_date.strftime("%y%m")
Expand All @@ -104,7 +108,7 @@ def prepare(start_date, end_date, release_version, output_dir):
assert release_version in ["v1", "v2", "v3", "v4", "v5", "v6"]

data = parse_data(release_version=f"release_{release_version}")
data = clean_data(data)
data = clean_data(data, keep_all_columns)
print("Len of data: ", len(data))

print("Writing to file...")
Expand All @@ -115,16 +119,10 @@ def prepare(start_date, end_date, release_version, output_dir):
for problem in data:
input_date = datetime.strptime(problem["contest_date"], "%Y-%m-%dT%H:%M:%S").date()
if start_date <= input_date <= end_date:
json.dump(
{
"task_id": problem["task_id"],
"question": problem["question"],
"difficulty": problem["difficulty"],
"subset_for_metrics": problem["difficulty"],
"release_version": release_version,
},
f,
)
output_record = {**problem}
output_record["subset_for_metrics"] = problem["difficulty"]
output_record["release_version"] = release_version
json.dump(output_record, f)
f.write("\n")


Expand All @@ -135,29 +133,29 @@ def prepare(start_date, end_date, release_version, output_dir):
("v6", "2024-08", "2025-05"), # current default in lb
]


if __name__ == "__main__":
    # CLI entry point: prepare either all default LiveCodeBench splits or a
    # single custom split defined by release version and a month range.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default=str(Path(__file__).parent))
    parser.add_argument("--release_version", type=str, default="all")
    # Fixed copy-paste bug: the --start_date help text previously read
    # "End date in YYYY-MM format".
    parser.add_argument("--start_date", type=str, default="all", help="Start date in YYYY-MM format")
    parser.add_argument("--end_date", type=str, default="all", help="End date in YYYY-MM format")
    parser.add_argument("--keep_all_columns", action="store_true", help="keep all columns in the output jsonl file")

    args = parser.parse_args()

    if args.release_version == "all" and args.start_date == "all" and args.end_date == "all":
        # No split selected explicitly: prepare every default split.
        for release_version, start_date, end_date in DEFAULT_SPLITS:
            print(f"Processing data for {release_version} from {start_date} to {end_date}")
            prepare(start_date, end_date, release_version, args.output_dir, args.keep_all_columns)
    else:
        # A custom split requires all three selectors; partial specification
        # is ambiguous, so reject it explicitly.
        if args.release_version == "all" or args.start_date == "all" or args.end_date == "all":
            raise ValueError(
                "If preparing a custom split, you must specify all "
                "--release_version, --start_date, and --end_date arguments."
            )
        prepare(args.start_date, args.end_date, args.release_version, args.output_dir, args.keep_all_columns)

# test_v5_2408_2502.jsonl: 279 samples
# test_v5_2410_2502.jsonl: 166 samples
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/evaluation/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
eval_bigcodebench,
eval_evalplus,
eval_livebench_coding,
eval_livecodebench,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
from nemo_skills.evaluation.evaluator.ioi import eval_ioi
from nemo_skills.evaluation.evaluator.livecodebench import eval_livecodebench
from nemo_skills.evaluation.evaluator.math import (
Lean4ProofEvaluator,
Lean4StatementEvaluator,
Expand Down
97 changes: 16 additions & 81 deletions nemo_skills/evaluation/evaluator/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

from omegaconf import OmegaConf

from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files
from nemo_skills.file_utils import unroll_files
from nemo_skills.utils import get_logger_name

LOG = logging.getLogger(get_logger_name(__file__))

Expand Down Expand Up @@ -99,86 +100,6 @@ def install_from_git(git_url):
print(f"Error during installation: {e}")


# TODO: use sandbox
@nested_dataclass(kw_only=True)
class LiveCodeBenchEvaluatorConfig:
    """Settings for the LiveCodeBench evaluator (see ``eval_livecodebench``)."""

    # Language of the generated solutions; "python" or "cpp" are supported.
    language: str = "python"  # "cpp" is another option now
    # Path to an external test-case file; required (and only used) when
    # language == "cpp".
    test_file: str = None


def eval_livecodebench(cfg):
    """Grade generated solutions with the external ``livecodebench`` package.

    For each input ``.jsonl`` file, samples are rewritten in place into the
    format the package expects (``question_id``/``code_list``), the package's
    ``evaluate`` entry point is invoked, and the resulting per-sample
    ``graded_list`` is merged back into the same file.

    Args:
        cfg: evaluator config; must provide ``input_files`` (glob patterns)
            and ``eval_config`` (parsed as ``LiveCodeBenchEvaluatorConfig``).

    Raises:
        ImportError: if ``livecodebench`` cannot be imported or installed.
        ValueError: if samples mix different ``release_version`` values.
    """
    try:
        from livecodebench.evaluate import evaluate
    except ImportError:
        LOG.info("Package 'livecodebench' not found. Attempting to install...")
        # install_from_git("git+https://github.com/wasiahmad/livecodebench.git")
        # Pinned to an exact commit for reproducible evaluation results.
        install_from_git("git+https://github.com/wasiahmad/livecodebench.git@f285640c20aaf18df1ee5917621a596af4630b5e")
        try:
            from livecodebench.evaluate import evaluate
        except ImportError:
            LOG.info("Failed to install 'livecodebench'. Please install it manually.")
            raise

    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)
    assert eval_config.language in ["python", "cpp"]
    if eval_config.language == "cpp":
        # cpp grading needs the external test-case file; python does not.
        assert eval_config.test_file is not None

    # All files/samples must share one release version, since a single
    # `evaluate` call is made per file with that version.
    release_version = None
    for jsonl_file in unroll_files(cfg.input_files):
        with open(jsonl_file) as f:
            samples = [preprocess_code(json.loads(line), eval_config.language) for line in f]
        for sample in samples:
            # Keys required by the livecodebench grader.
            sample["question_id"] = sample["task_id"]
            sample["code_list"] = [sample["completion"]]
            if release_version is None:
                release_version = sample["release_version"]
            if release_version != sample["release_version"]:
                raise ValueError(
                    f"All samples should have the same release version, "
                    f"but got {release_version} and {sample['release_version']}"
                )

        # Overwrite the input file in the grader's expected format.
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")

        # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10
        evaluate(
            custom_output_file=jsonl_file,
            release_version=f"release_{release_version}",
            k_list=[1],
            language=eval_config.language,
            test_file=None if eval_config.language == "python" else eval_config.test_file,
            num_process_evaluate=12,
            timeout=6 if eval_config.language == "python" else 30,
        )

        # Merge the grader's per-sample results back into the jsonl file.
        with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin:
            eval_grades = json.load(fin)
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                sample["graded_list"] = eval_grades["eval"][sample["task_id"]]["graded_list"]
                f.write(json.dumps(sample) + "\n")

        # moving eval file to ensure metrics are recomputed
        shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json")


def eval_livecodebench_pro(cfg):
    """Rewrite generations into the livecodebench-pro submission format.

    Each input ``.jsonl`` file is rewritten in place: ``task_id`` is renamed
    to ``problem_id``, ``completion`` to ``text_response``, and a null
    ``response_meta`` field is added, as expected by the external grader.
    """
    for jsonl_file in unroll_files(cfg.input_files):
        with open(jsonl_file) as f:
            samples = [preprocess_code(json.loads(line), "python") for line in f]
        for sample in samples:
            sample["problem_id"] = sample.pop("task_id")
            sample["text_response"] = sample.pop("completion")
            sample["response_meta"] = None

        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")


def eval_evalplus(cfg):
# TODO: need to move it to a separate docker (either our sandbox or separate srun)
from evalplus.evaluate import evaluate
Expand Down Expand Up @@ -228,6 +149,20 @@ def install_requirements(url):
print(f"Error during installation: {e}")


def eval_livecodebench_pro(cfg):
    """Rewrite generations into the livecodebench-pro submission format.

    Each input ``.jsonl`` file is rewritten in place: ``task_id`` becomes
    ``problem_id``, ``completion`` becomes ``text_response``, and a null
    ``response_meta`` field is added for the external grader.
    """
    for path in unroll_files(cfg.input_files):
        # Load and preprocess every record up front, then rewrite the file.
        with open(path) as fin:
            records = [preprocess_code(json.loads(line), "python") for line in fin]

        for record in records:
            record["problem_id"] = record.pop("task_id")
            record["text_response"] = record.pop("completion")
            record["response_meta"] = None

        with open(path, "wt", encoding="utf-8") as fout:
            fout.writelines(json.dumps(record) + "\n" for record in records)


def eval_livebench_coding(cfg):
try:
from livecodebench.evaluate import evaluate
Expand Down
Loading