Merged

50 commits
d5074e1
Remove separate eval_args
Kipok Oct 16, 2025
96e7114
Upgrade all interfaces
Kipok Oct 16, 2025
8b23dcd
Fixes
Kipok Oct 16, 2025
a6da01a
Fix dump_outputs
Kipok Oct 16, 2025
e3744c8
Dont skip eval
Kipok Oct 16, 2025
060488b
Fix omegaconf error
Kipok Oct 16, 2025
a84c6b2
Fix typos
Kipok Oct 17, 2025
2dc8608
Init all base configs in evaluators
Kipok Oct 17, 2025
e62f09e
Default input_file to None
Kipok Oct 17, 2025
4232e08
Fix config access
Kipok Oct 17, 2025
b51ee19
Update inits
Kipok Oct 17, 2025
6e747cc
Make remove_thinking empty generations even if no begin tag
Kipok Oct 17, 2025
c01c7da
Update ++remove_thinking everywhere
Kipok Oct 17, 2025
9bc6920
Add warning about remove_thinking
Kipok Oct 17, 2025
85ca59a
Automatically ignore remove_thinking if generation isn't a string
Kipok Oct 17, 2025
746fdd8
Address rabits comments
Kipok Oct 17, 2025
1dd43cb
Rollback test change
Kipok Oct 17, 2025
cfa20f5
Add missing parameters for swebench
Kipok Oct 17, 2025
441ba58
Add eval args fallback
Kipok Oct 17, 2025
81ab918
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 17, 2025
2bfdd6c
Update run_all.sh to take name as arg
Kipok Oct 17, 2025
66722be
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 17, 2025
166a392
Fix ruler and parallel thinking
Kipok Oct 17, 2025
cbe61bc
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 21, 2025
3676d7f
Fix swebench
Kipok Oct 21, 2025
b6d68ea
Fix mcq
Kipok Oct 21, 2025
df989c5
Change vllm to sglang for test
Kipok Oct 21, 2025
e603a4c
Disable parallel submission of slurm tests
Kipok Oct 21, 2025
5ea6e4f
Fix typo in ruler
Kipok Oct 21, 2025
17fd0ef
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 22, 2025
afec9c8
Add eval test for all datasets
Kipok Oct 23, 2025
22fa3bb
Rename remove_thinking -> parse_reasoning
Kipok Oct 23, 2025
d13ff85
Parallelize data prep
Kipok Oct 23, 2025
be762d5
Robust prepare for putnam
Kipok Oct 23, 2025
dd83b73
Change putnam to use git clone
Kipok Oct 23, 2025
70b3a91
Update split in bigcodebench
Kipok Oct 23, 2025
f0558fb
Fix flores default split
Kipok Oct 23, 2025
88aa352
Fixes
Kipok Oct 23, 2025
f052122
Fix eval test
Kipok Oct 23, 2025
ba50f52
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 24, 2025
19eeaf8
Rename math -> hendrycks_math
Kipok Oct 24, 2025
6c8f179
Remove judge tests for now
Kipok Oct 24, 2025
c0fefe9
Clean up
Kipok Oct 24, 2025
3868133
Fix math prepare
Kipok Oct 24, 2025
1f5177f
Remove test
Kipok Oct 24, 2025
e3ce4c9
Merge branch 'main' into igitman/reasoning-on-arg
Kipok Oct 24, 2025
1e11165
Fixes
Kipok Oct 24, 2025
71d2e8c
Fix for local non-sequential in generate
Kipok Oct 24, 2025
3dd7b5b
Fix tests
Kipok Oct 24, 2025
e5b92e7
Fix tests
Kipok Oct 24, 2025
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -31,10 +31,10 @@ repos:
exclude: ^mkdocs\.yml$
- id: detect-private-key
- id: end-of-file-fixer
exclude: docs/|\.txt$|\.patch$
exclude: docs/|\.txt$|\.patch$|test$
- id: requirements-txt-fixer
- id: trailing-whitespace
exclude: \.txt$|\.patch$
exclude: \.txt$|\.patch$|test$

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.9
3 changes: 0 additions & 3 deletions dockerfiles/Dockerfile.sandbox
@@ -33,9 +33,6 @@ RUN apt-get update && \
rm /tmp/pypy.tar.bz2 && \
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

# Install the DMOJ judge-server using pip (OJBench eval requirement)
RUN pip install git+https://github.com/DMOJ/judge-server.git@11bf2cd03df83f0df5970a08e98b4cec2dfaecd5

# Install Lean 4 toolchain
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
/root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \
12 changes: 6 additions & 6 deletions dockerfiles/ifbench.patch
@@ -3,13 +3,13 @@ index d3071c7..e303de7 100644
--- a/instructions.py
+++ b/instructions.py
@@ -31,7 +31,9 @@ import io

import instructions_util

-download('en_core_web_sm')
++# assumed to be predownloaded
++print("skipping download of en_core_web_sm")
+# assumed to be predownloaded
+print("skipping download of en_core_web_sm")
+# download('en_core_web_sm')

logger = logging.getLogger(__name__)

3 changes: 3 additions & 0 deletions docs/agentic_inference/parallel_thinking.md
@@ -57,6 +57,7 @@ ns eval \
--server_gpus 2 \
--server_type vllm \
--output_dir /experiments/qwen3_8b/gensynthesis \
++parse_reasoning=True \
++inference.tokens_to_generate=16384 \
++parallel_thinking.mode=gensynthesis \
++server.enable_soft_fail=True \
@@ -91,6 +92,7 @@ eval(
ctx=wrap_arguments(
"++inference.tokens_to_generate=16384 "
"++inference.temperature=0.6 "
"++parse_reasoning=True "
),
cluster="local",
benchmarks="livecodebench:8",
@@ -110,6 +112,7 @@ eval(
"++parallel_thinking.mode=genselect "
"++parallel_thinking.solution_key=completion "
"++parallel_thinking.generation_dir=/workspace/qwen3_4b_evals/eval-results/livecodebench "
"++parse_reasoning=True "
),
cluster="local",
benchmarks="livecodebench:8",
3 changes: 2 additions & 1 deletion docs/basics/index.md
@@ -267,7 +267,8 @@ run_cmd( # (1)!
eval(
ctx=wrap_arguments( # (2)!
"++inference.tokens_to_generate=16000 "
"++inference.temperature=0.6"
"++inference.temperature=0.6 "
"++parse_reasoning=True "
),
cluster=cluster,
model=f"{output_dir}/QwQ-32B",
69 changes: 4 additions & 65 deletions docs/evaluation/code.md
@@ -267,17 +267,18 @@ ns eval \
--split=test_v6_2408_2505 \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
--extra_eval_args="++eval_config.interpreter=python" \
++parse_reasoning=True \
++eval_config.interpreter=python \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=65536
```

##### Pypy3 Evaluation

To run with the Pypy3 interpreter, we need to use sandbox. Therefore, pass these flags `--with_sandbox --keep_mounts_for_sandbox` and modify the `--extra_eval_args` flag as shown below.
To run with the Pypy3 interpreter, we need to use the sandbox. Therefore, pass the flags `--with_sandbox --keep_mounts_for_sandbox` and also add the following arguments:
```
--extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl"
++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl
```

##### Verifying Results
@@ -345,68 +346,6 @@ Due to variance between runs, you can automatically repeat the evaluation and av
- Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py)
- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding).

### OJBench

- Benchmark is defined in [`nemo_skills/dataset/ojbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/ojbench/__init__.py)
- Original benchmark source is [here](https://github.com/He-Ren/OJBench/tree/main).

#### Data preparation

Before running ns eval, you will need to prepare the data with this command:

```
ns prepare_data --data_dir=<DATA_DIR> --cluster=<CLUSTER_NAME> ojbench
```

We encourage to download OJBench data into a Slurm cluster location because 15GB data will be downloaded by cloning [huggingface.co/datasets/He-Ren/OJBench_testdata](https://huggingface.co/datasets/He-Ren/OJBench_testdata). Two files will be created at `<DATA_DIR>` named `test_python.jsonl` and `test_cpp.jsonl`. Note that, data downloading require `HF_TOKEN` to be in the environment variables.

#### Sample run

Here's how to run a sample evaluation of [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on a Slurm cluster.

1. Prepare the data following instructions in the previous section.
2. Run
```
ns eval \
--cluster=<CLUSTER_NAME> \
--model=Qwen/Qwen3-32B \
--server_type=vllm \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=ojbench \
--split=test_python \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=32768
```
replacing <...> with your desired parameters.

After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ojbench/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ojbench/summarized-results/main_*` They should look something like this:
```
----------------------------- ojbench -----------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 232 | 19628 | 2201 | 27.16%


--------------------------- ojbench-easy --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 36 | 12052 | 1729 | 72.22%


--------------------------- ojbench-hard --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 117 | 22585 | 2191 | 5.13%


-------------------------- ojbench-medium -------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 79 | 18701 | 2201 | 39.24%
```

Keep in mind there is some variance between runs, so we recommend running evaluation multiple times and averaging out the resolve rate. To do that automatically, you can set `--benchmarks=ojbench:N`, where N is your desired number of repeats.

### human-eval-infilling

- Benchmark is defined in [`nemo_skills/dataset/human-eval-infilling/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/human-eval-infilling/__init__.py)
13 changes: 7 additions & 6 deletions docs/evaluation/index.md
@@ -50,6 +50,10 @@ ns prepare_data ruler --setup=llama_128k --tokenizer_path=meta-llama/Llama-3.1-8

## Running evaluation

!!! warning
    For correct evaluation of reasoning models, either provide a reasoning parser in the server args (e.g. `--server_args="--reasoning-parser ..."` for vllm)
    or set `++parse_reasoning=True` as well as an appropriate `++end_reasoning_string` (which defaults to `</think>`).
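As a rough sketch of what reasoning parsing does (a hypothetical helper, not the actual `nemo_skills` implementation), the generation is split on the end-reasoning string, and a generation with no end tag is treated as unfinished reasoning:

```python
# Hypothetical sketch of reasoning-trace stripping; the real implementation
# in nemo_skills may differ in details.
def parse_reasoning(generation, end_reasoning_string: str = "</think>"):
    """Return only the text after the reasoning block.

    If the end tag is missing, the whole generation is treated as
    unfinished reasoning and an empty answer is returned.
    Non-string generations (e.g. structured tool calls) are left untouched.
    """
    if not isinstance(generation, str):
        return generation
    _, sep, answer = generation.partition(end_reasoning_string)
    return answer.strip() if sep else ""


print(parse_reasoning("<think>chain of thought</think>The answer is 42."))
# -> The answer is 42.
```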

```bash
ns eval \
--cluster=local \
@@ -166,7 +170,7 @@ Different benchmarks have different evaluation options that you can customize. H
code execution timeout for scicode benchmark

```bash
--extra_eval_args="++eval_config.timeout=60"
++eval_config.timeout=60
```

## Using data on cluster
@@ -223,14 +227,11 @@ Inside [`nemo_skills/dataset/gsm8k/__init__.py`](https://github.com/NVIDIA-NeMo/
# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
GENERATION_ARGS = "++prompt_config=generic/math"
GENERATION_ARGS = "++eval_type=math ++prompt_config=generic/math"
```

The prompt config and default generation arguments are passed to the
[nemo_skills/inference/generate.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/generate.py) and
the default eval args are passed to the
[nemo_skills/evaluation/evaluate_results.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluate_results.py).
[nemo_skills/inference/generate.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/generate.py).
The dataset group is used by [nemo_skills/dataset/prepare.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/prepare.py)
to help download only benchmarks from a particular group if `--dataset_groups` parameter is used.
Finally, the metrics type is used to pick a metrics class from [nemo_skills/evaluation/metrics/map_metrics.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/metrics/map_metrics.py)
41 changes: 21 additions & 20 deletions docs/evaluation/multilingual.md
@@ -15,14 +15,14 @@ Once prepared, the `ns eval` command will run on all languages prepared, and the
Our evaluation template and answer extraction mechanism try to match the configuration in [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/mmlu_prox).
Some reference numbers and commands for reproduction:

| Model | Type | en | de | es | fr | it | ja |
|:-------------------|:-------|-----:|:-----|:-----|:-----|:-----|:-----|
| gpt-oss-120b | Public | 80.8 | - | - | - | - | - |
| gpt-oss-120b | Nemo-Skills | 75.5 | 71.8 | 73.4 | 70.9 | 71.7 | 66.7 |
| mistral-3.1-small | Public | 62 | 58.5 | 59.4 | 60.6 | 59.6 | 54.4 |
| mistral-3.1-small | Nemo-Skills | 67.6 | 59.9 | 63.7 | 63.2 | 63.6 | 56.6 |
| qwen3-32b-thinking | Public | 74.9 | 71.7 | 72.8 | 72.1 | 73.5 | 70.2 |
| qwen3-32b-thinking | Nemo-Skills | 72.7 | 70.4 | 74.0 | 73.7 | 76.3 | 73.9 |
| Model | Type | en | de | es | fr | it | ja |
| :----------------- | :---------- | ---: | :--- | :--- | :--- | :--- | :--- |
| gpt-oss-120b | Public | 80.8 | - | - | - | - | - |
| gpt-oss-120b | Nemo-Skills | 75.5 | 71.8 | 73.4 | 70.9 | 71.7 | 66.7 |
| mistral-3.1-small | Public | 62 | 58.5 | 59.4 | 60.6 | 59.6 | 54.4 |
| mistral-3.1-small | Nemo-Skills | 67.6 | 59.9 | 63.7 | 63.2 | 63.6 | 56.6 |
| qwen3-32b-thinking | Public | 74.9 | 71.7 | 72.8 | 72.1 | 73.5 | 70.2 |
| qwen3-32b-thinking | Nemo-Skills | 72.7 | 70.4 | 74.0 | 73.7 | 76.3 | 73.9 |

=== "GPT-OSS-120B"

@@ -65,6 +65,7 @@ Some reference numbers for reference and commands for reproduction:
--server_type=vllm \
--num_chunks=32 \
--server_gpus=2 \
++parse_reasoning=True \
++inference.temperature=0.6 \
++inference.top_k=20 \
++inference.tokens_to_generate=38912
@@ -77,12 +78,12 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu

Some reference numbers for devtest split (xx corresponds to average over 5 languages: de, es, fr, it, ja):

| Model | en->xx | xx->en | xx->xx |
|:-----------------------|------:|------:|------:|
| Nemotron-NanoV2-9B-v2 | 32.5 | 34 | 25.9 |
| Qwen3-8B | 31.5 | 34.6 | 25.7 |
| Qwen3-30B-A3B | 33.3 | 35.5 | 27.1 |
| gpt-oss-20B | 32.4 | 34.1 | 25 |
| Model | en->xx | xx->en | xx->xx |
| :-------------------- | -----: | -----: | -----: |
| Nemotron-NanoV2-9B-v2 | 32.5 | 34 | 25.9 |
| Qwen3-8B | 31.5 | 34.6 | 25.7 |
| Qwen3-30B-A3B | 33.3 | 35.5 | 27.1 |
| gpt-oss-20B | 32.4 | 34.1 | 25 |

=== "Nemotron-NanoV2-9B-v2"

@@ -150,12 +151,12 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu

Some reference numbers for test split (xx corresponds to average over 5 languages: de, es, fr, it, ja):

| Model | en->de | en->es | en->fr | en->it | en->ja | en->xx |
|:-----------------------|------:|------:|------:|------:|------:|------:|
| Nemotron-NanoV2-9B-v2 | 25.3 | 37.7 | 33.4 | 33.8 | 20.9 | 30.2 |
| Qwen3-8B | 26.2 | 38.5 | 33.1 | 33.1 | 21.7 | 30.5 |
| Qwen3-30B-A3B | 28.5 | 40 | 35.1 | 36 | 23.2 | 32.5 |
| gpt-oss-20B | 27.3 | 42.3 | 32.8 | 34.9 | 25.2 | 32.5 |
| Model | en->de | en->es | en->fr | en->it | en->ja | en->xx |
| :-------------------- | -----: | -----: | -----: | -----: | -----: | -----: |
| Nemotron-NanoV2-9B-v2 | 25.3 | 37.7 | 33.4 | 33.8 | 20.9 | 30.2 |
| Qwen3-8B | 26.2 | 38.5 | 33.1 | 33.1 | 21.7 | 30.5 |
| Qwen3-30B-A3B | 28.5 | 40 | 35.1 | 36 | 23.2 | 32.5 |
| gpt-oss-20B | 27.3 | 42.3 | 32.8 | 34.9 | 25.2 | 32.5 |

=== "Nemotron-NanoV2-9B-v2"

6 changes: 3 additions & 3 deletions docs/evaluation/natural-math.md
@@ -11,7 +11,7 @@ We also support arbitrary regex based extraction. E.g., if you use a custom prom
at the end of the solution, you can use these parameters to match the extraction logic to that prompt

```bash
--extra_eval_args="++eval_config.extract_from_boxed=False ++eval_config.extract_regex='Final answer: (.+)$'"
++eval_config.extract_from_boxed=False ++eval_config.extract_regex='Final answer: (.+)$'
```
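For illustration, extraction with that configuration then behaves roughly like the following sketch (the regex mirrors the flag above; the function name and internals are assumptions, not the actual evaluator code):

```python
import re


def extract_answer(solution: str, extract_regex: str = r"Final answer: (.+)$"):
    # Instead of looking for \boxed{...}, search for the custom pattern
    # at the end of a line in the solution.
    match = re.search(extract_regex, solution, flags=re.MULTILINE)
    return match.group(1).strip() if match else None


print(extract_answer("The sum of the roots is 7.\nFinal answer: 7"))
# -> 7
```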

!!! warning
@@ -159,9 +159,9 @@ In either case you can always customize the judge prompt by setting a new `++pro
- Benchmark is defined in [`nemo_skills/dataset/omni-math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/omni-math/__init__.py)
- Original benchmark source is [here](https://omni-math.github.io/).

### math
### hendrycks_math

- Benchmark is defined in [`nemo_skills/dataset/math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/math/__init__.py)
- Benchmark is defined in [`nemo_skills/dataset/hendrycks_math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/hendrycks_math/__init__.py)
- Original benchmark source is [here](https://github.com/hendrycks/math).

### math-500
8 changes: 5 additions & 3 deletions docs/evaluation/robustness.md
@@ -28,10 +28,12 @@ Note that every prompt is a separate job, and all parameters are shared for all
```python
from nemo_skills.pipeline.cli import wrap_arguments, robust_eval
robust_eval(ctx=wrap_arguments(
f"++inference.temperature=0.6 "
f"++inference.top_p=0.95 "
"++inference.temperature=0.6 "
"++inference.top_p=0.95 "
"++parse_reasoning=True "
),
prompt_set_config='robustness/prompt_set_config', # OR nemo_skills/prompt/config/robutness/prompt_set_config OR absolute path to .yaml file
# OR nemo_skills/prompt/config/robutness/prompt_set_config OR absolute path to .yaml file
prompt_set_config='robustness/prompt_set_config',
cluster=cluster_config,
model="Qwen/Qwen3-8B",
server_type='vllm',
15 changes: 9 additions & 6 deletions docs/evaluation/tool-calling.md
@@ -62,6 +62,7 @@ ns eval \
--server_gpus 2 \
--server_type vllm \
--output_dir /workspace/qwen3-4b-client-parsing/ \
++parse_reasoning=True \
++inference.tokens_to_generate=8192 \
++model_name=Qwen/Qwen3-4B-FC \
```
@@ -87,10 +88,11 @@ The following command evaluates the `Qwen3-4B` model which uses a standard tool-
ns eval \
--benchmarks bfcl_v3 \
--cluster dfw \
--model /hf_models/Qwen3-4B \
--model Qwen/Qwen3-4B \
--server_gpus 2 \
--server_type vllm \
--output_dir /workspace/qwen3-4b-server-parsing/ \
++parse_reasoning=True \
++inference.tokens_to_generate=8192 \
++use_client_parsing=False \
--server_args="--enable-auto-tool-choice --tool-call-parser hermes"
@@ -110,6 +112,7 @@ ns eval \
--server_gpus=2 \
--server_type=vllm \
--output_dir=/workspace/llama_nemotron_49b_1_5_tool_calling/ \
++parse_reasoning=True \
++inference.tokens_to_generate=65536 \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
@@ -122,11 +125,11 @@ ns eval \

### Configuration Parameters

| Configuration | True | False |
|---------------|------|-------|
| `++use_client_parsing` | Default | - |
| `++model_name` | Required for client parsing | - |
| `--server_args` | - | Required for server-side parsing |
| Configuration | True | False |
| ---------------------- | --------------------------- | -------------------------------- |
| `++use_client_parsing` | Default | - |
| `++model_name` | Required for client parsing | - |
| `--server_args` | - | Required for server-side parsing |



6 changes: 3 additions & 3 deletions docs/pipelines/decontamination.md
@@ -19,7 +19,7 @@ Let's say you want to check for contamination of [MATH](https://github.com/hendr
training set with MATH, AMC-23 and AIME-24 test sets. First, get the data

```bash
ns prepare_data math amc23 aime24
ns prepare_data hendrycks_math amc23 aime24
```

Then we need to retrieve top-k similar questions from the training set. Assuming
@@ -30,12 +30,12 @@ you can do it in the following way
from nemo_skills.pipeline.cli import wrap_arguments, run_cmd, generate


test_sets = ['math', 'amc23', 'aime24']
test_sets = ['hendrycks_math', 'amc23', 'aime24']
compare_to = ",".join(f"/nemo_run/code/nemo_skills/dataset/{test_set}/test.jsonl" for test_set in test_sets)

cmd = (
f"python -m nemo_skills.inference.retrieve_similar "
f" ++retrieve_from='/nemo_run/code/nemo_skills/dataset/math/train.jsonl' "
f" ++retrieve_from='/nemo_run/code/nemo_skills/dataset/hendrycks_math/train.jsonl' "
f" ++compare_to=\\\'{compare_to}\\\'"
f" ++output_file='/workspace/math-contamination-retrieved.jsonl' "
f" ++top_k=1 "