diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6aa6d7c0bb..2eabe2d3e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,10 +31,10 @@ repos: exclude: ^mkdocs\.yml$ - id: detect-private-key - id: end-of-file-fixer - exclude: docs/|\.txt$|\.patch$ + exclude: docs/|\.txt$|\.patch$|test$ - id: requirements-txt-fixer - id: trailing-whitespace - exclude: \.txt$|\.patch$ + exclude: \.txt$|\.patch$|test$ - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.9 diff --git a/dockerfiles/Dockerfile.sandbox b/dockerfiles/Dockerfile.sandbox index 2b0043d65c..735ff6777c 100644 --- a/dockerfiles/Dockerfile.sandbox +++ b/dockerfiles/Dockerfile.sandbox @@ -33,9 +33,6 @@ RUN apt-get update && \ rm /tmp/pypy.tar.bz2 && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* -# Install the DMOJ judge-server using pip (OJBench eval requirement) -RUN pip install git+https://github.com/DMOJ/judge-server.git@11bf2cd03df83f0df5970a08e98b4cec2dfaecd5 - # Install Lean 4 toolchain RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \ /root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \ diff --git a/dockerfiles/ifbench.patch b/dockerfiles/ifbench.patch index 74426fcd86..510f8a1c01 100644 --- a/dockerfiles/ifbench.patch +++ b/dockerfiles/ifbench.patch @@ -3,13 +3,13 @@ index d3071c7..e303de7 100644 --- a/instructions.py +++ b/instructions.py @@ -31,7 +31,9 @@ import io - + import instructions_util - + -download('en_core_web_sm') -++# assumed to be predownloaded -++print("skipping download of en_core_web_sm") ++# assumed to be predownloaded ++print("skipping download of en_core_web_sm") +# download('en_core_web_sm') - + logger = logging.getLogger(__name__) - + diff --git a/docs/agentic_inference/parallel_thinking.md b/docs/agentic_inference/parallel_thinking.md index 67911be3d5..5e3e8ab512 100644 --- a/docs/agentic_inference/parallel_thinking.md +++ 
b/docs/agentic_inference/parallel_thinking.md @@ -57,6 +57,7 @@ ns eval \ --server_gpus 2 \ --server_type vllm \ --output_dir /experiments/qwen3_8b/gensynthesis \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=16384 \ ++parallel_thinking.mode=gensynthesis \ ++server.enable_soft_fail=True \ @@ -91,6 +92,7 @@ eval( ctx=wrap_arguments( "++inference.tokens_to_generate=16384 " "++inference.temperature=0.6 " + "++parse_reasoning=True " ), cluster="local", benchmarks="livecodebench:8", @@ -110,6 +112,7 @@ eval( "++parallel_thinking.mode=genselect " "++parallel_thinking.solution_key=completion " "++parallel_thinking.generation_dir=/workspace/qwen3_4b_evals/eval-results/livecodebench " + "++parse_reasoning=True " ), cluster="local", benchmarks="livecodebench:8", diff --git a/docs/basics/index.md b/docs/basics/index.md index 436496c104..4dba8e308d 100644 --- a/docs/basics/index.md +++ b/docs/basics/index.md @@ -267,7 +267,8 @@ run_cmd( # (1)! eval( ctx=wrap_arguments( # (2)! "++inference.tokens_to_generate=16000 " - "++inference.temperature=0.6" + "++inference.temperature=0.6 " + "++parse_reasoning=True " ), cluster=cluster, model=f"{output_dir}/QwQ-32B", diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 2c4ef23ec6..4353bbb59d 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -267,7 +267,8 @@ ns eval \ --split=test_v6_2408_2505 \ --data_dir= \ --output_dir= \ - --extra_eval_args="++eval_config.interpreter=python" \ + ++parse_reasoning=True \ + ++eval_config.interpreter=python \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ ++inference.tokens_to_generate=65536 @@ -275,9 +276,9 @@ ns eval \ ##### Pypy3 Evaluation -To run with the Pypy3 interpreter, we need to use sandbox. Therefore, pass these flags `--with_sandbox --keep_mounts_for_sandbox` and modify the `--extra_eval_args` flag as shown below. +To run with the Pypy3 interpreter, we need to use sandbox. 
Therefore, pass these flags `--with_sandbox --keep_mounts_for_sandbox` and also add the following arguments ``` ---extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=/livecodebench/test_v6_2408_2505.jsonl" +++eval_config.interpreter=pypy3 ++eval_config.test_file=/livecodebench/test_v6_2408_2505.jsonl ``` ##### Verifying Results @@ -345,68 +346,6 @@ Due to variance between runs, you can automatically repeat the evaluation and av - Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py) - Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding). -### OJBench - -- Benchmark is defined in [`nemo_skills/dataset/ojbench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/ojbench/__init__.py) -- Original benchmark source is [here](https://github.com/He-Ren/OJBench/tree/main). - -#### Data preparation - -Before running ns eval, you will need to prepare the data with this command: - -``` -ns prepare_data --data_dir= --cluster= ojbench -``` - -We encourage to download OJBench data into a Slurm cluster location because 15GB data will be downloaded by cloning [huggingface.co/datasets/He-Ren/OJBench_testdata](https://huggingface.co/datasets/He-Ren/OJBench_testdata). Two files will be created at `` named `test_python.jsonl` and `test_cpp.jsonl`. Note that, data downloading require `HF_TOKEN` to be in the environment variables. - -#### Sample run - -Here's how to run a sample evaluation of [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on a Slurm cluster. - -1. Prepare the data following instructions in the previous section. -2. 
Run -``` -ns eval \ - --cluster= \ - --model=Qwen/Qwen3-32B \ - --server_type=vllm \ - --server_nodes=1 \ - --server_gpus=8 \ - --benchmarks=ojbench \ - --split=test_python \ - --data_dir= \ - --output_dir= \ - ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++inference.tokens_to_generate=32768 -``` -replacing <...> with your desired parameters. - -After all jobs are complete, you can check the results in `/eval-results/ojbench/metrics.json`. You can also take a look at `/eval-results/ojbench/summarized-results/main_*` They should look something like this: -``` ------------------------------ ojbench ----------------------------- -evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy -pass@1 | 232 | 19628 | 2201 | 27.16% - - ---------------------------- ojbench-easy -------------------------- -evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy -pass@1 | 36 | 12052 | 1729 | 72.22% - - ---------------------------- ojbench-hard -------------------------- -evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy -pass@1 | 117 | 22585 | 2191 | 5.13% - - --------------------------- ojbench-medium ------------------------- -evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy -pass@1 | 79 | 18701 | 2201 | 39.24% -``` - -Keep in mind there is some variance between runs, so we recommend running evaluation multiple times and averaging out the resolve rate. To do that automatically, you can set `--benchmarks=ojbench:N`, where N is your desired number of repeats. 
- ### human-eval-infilling - Benchmark is defined in [`nemo_skills/dataset/human-eval-infilling/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/human-eval-infilling/__init__.py) diff --git a/docs/evaluation/index.md b/docs/evaluation/index.md index 0bfe6b68b5..171d1de735 100644 --- a/docs/evaluation/index.md +++ b/docs/evaluation/index.md @@ -50,6 +50,10 @@ ns prepare_data ruler --setup=llama_128k --tokenizer_path=meta-llama/Llama-3.1-8 ## Running evaluation +!!! warning + For correct evaluation of reasoning models, either provide reasoning parser in server args (e.g. `--server_args="--reasoning-parser ..."` for vllm) + or set `++parse_reasoning=True` as well as an appropriate `++end_reasoning_string` (which defaults to `</think>`). + ```bash ns eval \ --cluster=local \ @@ -166,7 +170,7 @@ Different benchmarks have different evaluation options that you can customize. H code execution timeout for scicode benchmark ```bash - --extra_eval_args="++eval_config.timeout=60" + ++eval_config.timeout=60 ``` ## Using data on cluster @@ -223,14 +227,11 @@ Inside [`nemo_skills/dataset/gsm8k/__init__.py`](https://github.com/NVIDIA-NeMo/ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = 'math' METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++eval_type=math ++prompt_config=generic/math" ``` The prompt config and default generation arguments are passed to the -[nemo_skills/inference/generate.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/generate.py) and -the default eval args are passed to the -[nemo_skills/evaluation/evaluate_results.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluate_results.py). +[nemo_skills/inference/generate.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/inference/generate.py). 
The dataset group is used by [nemo_skills/dataset/prepare.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/prepare.py) to help download only benchmarks from a particular group if `--dataset_groups` parameter is used. Finally, the metrics type is used to pick a metrics class from [nemo_skills/evaluation/metrics/map_metrics.py](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/metrics/map_metrics.py) diff --git a/docs/evaluation/multilingual.md b/docs/evaluation/multilingual.md index a1ccea473a..526e986120 100644 --- a/docs/evaluation/multilingual.md +++ b/docs/evaluation/multilingual.md @@ -15,14 +15,14 @@ Once prepared, the `ns eval` command will run on all languages prepared, and the Our evaluation template and answer extraction mechanism tries to match the configration in [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/mmlu_prox). Some reference numbers for reference and commands for reproduction: -| Model | Type | en | de | es | fr | it | ja | -|:-------------------|:-------|-----:|:-----|:-----|:-----|:-----|:-----| -| gpt-oss-120b | Public | 80.8 | - | - | - | - | - | -| gpt-oss-120b | Nemo-Skills | 75.5 | 71.8 | 73.4 | 70.9 | 71.7 | 66.7 | -| mistral-3.1-small | Public | 62 | 58.5 | 59.4 | 60.6 | 59.6 | 54.4 | -| mistral-3.1-small | Nemo-Skills | 67.6 | 59.9 | 63.7 | 63.2 | 63.6 | 56.6 | -| qwen3-32b-thinking | Public | 74.9 | 71.7 | 72.8 | 72.1 | 73.5 | 70.2 | -| qwen3-32b-thinking | Nemo-Skills | 72.7 | 70.4 | 74.0 | 73.7 | 76.3 | 73.9 | +| Model | Type | en | de | es | fr | it | ja | +| :----------------- | :---------- | ---: | :--- | :--- | :--- | :--- | :--- | +| gpt-oss-120b | Public | 80.8 | - | - | - | - | - | +| gpt-oss-120b | Nemo-Skills | 75.5 | 71.8 | 73.4 | 70.9 | 71.7 | 66.7 | +| mistral-3.1-small | Public | 62 | 58.5 | 59.4 | 60.6 | 59.6 | 54.4 | +| mistral-3.1-small | Nemo-Skills | 67.6 | 59.9 | 63.7 | 63.2 | 63.6 | 56.6 | +| qwen3-32b-thinking | 
Public | 74.9 | 71.7 | 72.8 | 72.1 | 73.5 | 70.2 | +| qwen3-32b-thinking | Nemo-Skills | 72.7 | 70.4 | 74.0 | 73.7 | 76.3 | 73.9 | === "GPT-OSS-120B" @@ -65,6 +65,7 @@ Some reference numbers for reference and commands for reproduction: --server_type=vllm \ --num_chunks=32 \ --server_gpus=2 \ + ++parse_reasoning=True \ ++inference.temperature=0.6 \ ++inference.top_k=20 \ ++inference.tokens_to_generate=38912 @@ -77,12 +78,12 @@ Some reference numbers for reference and commands for reproduction: Some reference numbers for devtest split (xx corresponds to average over 5 languages: de, es, fr, it, ja): -| Model | en->xx | xx->en | xx->xx | -|:-----------------------|------:|------:|------:| -| Nemotron-NanoV2-9B-v2 | 32.5 | 34 | 25.9 | -| Qwen3-8B | 31.5 | 34.6 | 25.7 | -| Qwen3-30B-A3B | 33.3 | 35.5 | 27.1 | -| gpt-oss-20B | 32.4 | 34.1 | 25 | +| Model | en->xx | xx->en | xx->xx | +| :-------------------- | -----: | -----: | -----: | +| Nemotron-NanoV2-9B-v2 | 32.5 | 34 | 25.9 | +| Qwen3-8B | 31.5 | 34.6 | 25.7 | +| Qwen3-30B-A3B | 33.3 | 35.5 | 27.1 | +| gpt-oss-20B | 32.4 | 34.1 | 25 | === "Nemotron-NanoV2-9B-v2" @@ -150,12 +151,12 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu Some reference numbers for test split (xx corresponds to average over 5 languages: de, es, fr, it, ja): -| Model | en->de | en->es | en->fr | en->it | en->ja | en->xx | -|:-----------------------|------:|------:|------:|------:|------:|------:| -| Nemotron-NanoV2-9B-v2 | 25.3 | 37.7 | 33.4 | 33.8 | 20.9 | 30.2 | -| Qwen3-8B | 26.2 | 38.5 | 33.1 | 33.1 | 21.7 | 30.5 | -| Qwen3-30B-A3B | 28.5 | 40 | 35.1 | 36 | 23.2 | 32.5 | -| gpt-oss-20B | 27.3 | 42.3 | 32.8 | 34.9 | 25.2 | 32.5 | +| Model | en->de | en->es | en->fr | en->it | en->ja | en->xx | +| :-------------------- | -----: | -----: | -----: | -----: | -----: | -----: | +| Nemotron-NanoV2-9B-v2 | 25.3 | 37.7 | 33.4 | 33.8 | 20.9 | 30.2 | +| Qwen3-8B | 26.2 | 38.5 | 33.1 | 33.1 | 21.7 | 30.5 | +| 
Qwen3-30B-A3B | 28.5 | 40 | 35.1 | 36 | 23.2 | 32.5 | +| gpt-oss-20B | 27.3 | 42.3 | 32.8 | 34.9 | 25.2 | 32.5 | === "Nemotron-NanoV2-9B-v2" diff --git a/docs/evaluation/natural-math.md b/docs/evaluation/natural-math.md index 03e05797d6..9d0aac2d4e 100644 --- a/docs/evaluation/natural-math.md +++ b/docs/evaluation/natural-math.md @@ -11,7 +11,7 @@ We also support arbitrary regex based extraction. E.g., if you use a custom prom at the end of the solution, you can use these parameters to match the extraction logic to that prompt ```bash - --extra_eval_args="++eval_config.extract_from_boxed=False ++eval_config.extract_regex='Final answer: (.+)$'" + ++eval_config.extract_from_boxed=False ++eval_config.extract_regex='Final answer: (.+)$' ``` !!! warning @@ -159,9 +159,9 @@ In either case you can always customize the judge prompt by setting a new `++pro - Benchmark is defined in [`nemo_skills/dataset/omni-math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/omni-math/__init__.py) - Original benchmark source is [here](https://omni-math.github.io/). -### math +### hendrycks_math -- Benchmark is defined in [`nemo_skills/dataset/math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/math/__init__.py) +- Benchmark is defined in [`nemo_skills/dataset/hendrycks_math/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/hendrycks_math/__init__.py) - Original benchmark source is [here](https://github.com/hendrycks/math). 
### math-500 diff --git a/docs/evaluation/robustness.md b/docs/evaluation/robustness.md index e726bd1903..2c6a80c576 100644 --- a/docs/evaluation/robustness.md +++ b/docs/evaluation/robustness.md @@ -28,10 +28,12 @@ Note that every prompt is a separate job, and all parameters are shared for all ```python from nemo_skills.pipeline.cli import wrap_arguments, robust_eval robust_eval(ctx=wrap_arguments( - f"++inference.temperature=0.6 " - f"++inference.top_p=0.95 " + "++inference.temperature=0.6 " + "++inference.top_p=0.95 " + "++parse_reasoning=True " ), - prompt_set_config='robustness/prompt_set_config', # OR nemo_skills/prompt/config/robutness/prompt_set_config OR absolute path to .yaml file + # OR nemo_skills/prompt/config/robutness/prompt_set_config OR absolute path to .yaml file + prompt_set_config='robustness/prompt_set_config', cluster=cluster_config, model="Qwen/Qwen3-8B", server_type='vllm', diff --git a/docs/evaluation/tool-calling.md b/docs/evaluation/tool-calling.md index 00543688d4..0aa5cab7a5 100644 --- a/docs/evaluation/tool-calling.md +++ b/docs/evaluation/tool-calling.md @@ -62,6 +62,7 @@ ns eval \ --server_gpus 2 \ --server_type vllm \ --output_dir /workspace/qwen3-4b-client-parsing/ \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=8192 \ ++model_name=Qwen/Qwen3-4B-FC \ ``` @@ -87,10 +88,11 @@ The following command evaluates the `Qwen3-4B` model which uses a standard tool- ns eval \ --benchmarks bfcl_v3 \ --cluster dfw \ - --model /hf_models/Qwen3-4B \ + --model Qwen/Qwen3-4B \ --server_gpus 2 \ --server_type vllm \ --output_dir /workspace/qwen3-4b-server-parsing/ \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=8192 \ ++use_client_parsing=False \ --server_args="--enable-auto-tool-choice --tool-call-parser hermes" @@ -110,6 +112,7 @@ ns eval \ --server_gpus=2 \ --server_type=vllm \ --output_dir=/workspace/llama_nemotron_49b_1_5_tool_calling/ \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ 
++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -122,11 +125,11 @@ ns eval \ ### Configuration Parameters -| Configuration | True | False | -|---------------|------|-------| -| `++use_client_parsing` | Default | - | -| `++model_name` | Required for client parsing | - | -| `--server_args` | - | Required for server-side parsing | +| Configuration | True | False | +| ---------------------- | --------------------------- | -------------------------------- | +| `++use_client_parsing` | Default | - | +| `++model_name` | Required for client parsing | - | +| `--server_args` | - | Required for server-side parsing | diff --git a/docs/pipelines/decontamination.md b/docs/pipelines/decontamination.md index 52a643e5fb..c69c565ac5 100644 --- a/docs/pipelines/decontamination.md +++ b/docs/pipelines/decontamination.md @@ -19,7 +19,7 @@ Let's say you want to check for contamination of [MATH](https://github.com/hendr training set with MATH, AMC-23 and AIME-24 test sets. First, get the data ```bash -ns prepare_data math amc23 aime24 +ns prepare_data hendrycks_math amc23 aime24 ``` Then we need to retrieve top-k similar questions from the training set. 
Assuming @@ -30,12 +30,12 @@ you can do it in the following way from nemo_skills.pipeline.cli import wrap_arguments, run_cmd, generate -test_sets = ['math', 'amc23', 'aime24'] +test_sets = ['hendrycks_math', 'amc23', 'aime24'] compare_to = ",".join(f"/nemo_run/code/nemo_skills/dataset/{test_set}/test.jsonl" for test_set in test_sets) cmd = ( f"python -m nemo_skills.inference.retrieve_similar " - f" ++retrieve_from='/nemo_run/code/nemo_skills/dataset/math/train.jsonl' " + f" ++retrieve_from='/nemo_run/code/nemo_skills/dataset/hendrycks_math/train.jsonl' " f" ++compare_to=\\\'{compare_to}\\\'" f" ++output_file='/workspace/math-contamination-retrieved.jsonl' " f" ++top_k=1 " diff --git a/docs/pipelines/generation.md b/docs/pipelines/generation.md index 147c1f9592..b777b030bd 100644 --- a/docs/pipelines/generation.md +++ b/docs/pipelines/generation.md @@ -108,7 +108,7 @@ as an example. First, let's prepare the data if you have not done so yet. ```bash -ns prepare_data math +ns prepare_data hendrycks_math ``` Then we can run the generation @@ -122,8 +122,8 @@ ns generate \ --server_nodes=2 \ --num_random_seeds=32 \ --output_dir=/workspace/synthetic-math-solutions \ - --eval_args="++eval_type=math" \ - --input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \ + --input_file=/nemo_run/code/nemo_skills/dataset/hendrycks_math/train.jsonl \ + ++eval_type=math \ ++prompt_config=generic/math-base \ ++examples_type=math_text_detailed \ ++inference.endpoint_type=text \ @@ -133,13 +133,13 @@ In this case we are assuming you're running on a slurm cluster and have downloaded Llama 3.1 405B. -Note that in this case we use a path to one the train set of the "math" dataset which we prepared with previous command. +Note that in this case we use a path to the train set of the "hendrycks_math" dataset which we prepared with previous command. 
We are using a [generic/math](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/config/generic/math.yaml) config and a tokenizer for the base model (we found Llama 3.1 follows few-shots much better without chat tokens). Finally, we are specifying few shot examples which come from [here](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/prompt/few_shot_examples/math.py) -and asking the script to evaluate the generated solutions by providing `--eval_args`. +and asking the script to evaluate the generated solutions by providing `++eval_type=math`. An example prompt (printed by the generate script) for that job is below. diff --git a/docs/pipelines/llm-as-a-judge.md b/docs/pipelines/llm-as-a-judge.md index 13d6ccc276..6f78cfde15 100644 --- a/docs/pipelines/llm-as-a-judge.md +++ b/docs/pipelines/llm-as-a-judge.md @@ -11,7 +11,7 @@ While we do perform such comparison by default, for most accurate results it's b E.g. symbolic comparison can perform very inaccurately for multi-choice questions where an answer might either be one of the letters or an expression corresponding to that letter. -If you have an output of the [evaluation script](evaluation.md) on e.g. math benchmark, you can run LLM-as-a-judge +If you have an output of the [evaluation script](evaluation.md) on e.g. hendrycks_math benchmark, you can run LLM-as-a-judge in the following way (assuming you have `/workspace` mounted in your [cluster config](../basics/cluster-configs.md) and evaluation output available in `/workspace/test-eval/eval-results`). 
@@ -22,14 +22,14 @@ ns generate \ --model=gpt-4o \ --server_type=openai \ --server_address=https://api.openai.com/v1 \ - --output_dir=/workspace/test-eval-judge/eval-results/math \ - --input_dir=/workspace/test-eval/eval-results/math \ + --output_dir=/workspace/test-eval-judge/eval-results/hendrycks_math \ + --input_dir=/workspace/test-eval/eval-results/hendrycks_math \ --num_random_seeds= ``` -This will run the judge pipeline on the data inside `eval-results/math` folder and judge solutions from `output-rsX.jsonl` files. -If you ran the benchmark with a single generation (e.g. using `math` or `math:0`) then -use `--input_file=/workspace/test-eval/eval-results/math/output.jsonl` instead of `--input_dir` and `--num_random_seeds` arguments. +This will run the judge pipeline on the data inside `eval-results/hendrycks_math` folder and judge solutions from `output-rsX.jsonl` files. +If you ran the benchmark with a single generation (e.g. using `hendrycks_math` or `hendrycks_math:0`) then +use `--input_file=/workspace/test-eval/eval-results/hendrycks_math/output.jsonl` instead of `--input_dir` and `--num_random_seeds` arguments. In this example we use gpt-4o from OpenAI, but you can use Llama-405B (that you can host on cluster yourself) or any other models. If you have multiple benchmarks, you would need to run the command multiple times. 
@@ -52,7 +52,7 @@ evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct pass@1 | 1319 | 95.00 | 95.75 | 95.00 | 95.75 | 0.00 --------------------------------------------------- math ------------------------------------------------- +-------------------------------------------- hendrycks_math --------------------------------------------- evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer pass@1 | 5000 | 67.32 | 67.88 | 67.02 | 68.18 | 2.64 diff --git a/docs/pipelines/training.md b/docs/pipelines/training.md index 2eb0991112..020761051b 100644 --- a/docs/pipelines/training.md +++ b/docs/pipelines/training.md @@ -129,7 +129,7 @@ eval( model=f"{output_dir}/final_hf_model", server_type="trtllm", output_dir=f"{output_dir}/results/", - benchmarks="gsm8k,math", + benchmarks="gsm8k,hendrycks_math", server_gpus=8, run_after=expname, ) diff --git a/docs/releases/opencodereasoning/evaluation.md b/docs/releases/opencodereasoning/evaluation.md index 2f4b97b143..39bdc21807 100644 --- a/docs/releases/opencodereasoning/evaluation.md +++ b/docs/releases/opencodereasoning/evaluation.md @@ -23,6 +23,7 @@ ns eval \ --benchmarks=livecodebench:8 \ --split=test_v6_2408_2505 \ --server_gpus=1 \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=64000 ``` diff --git a/docs/releases/openmathinstruct2/dataset.md b/docs/releases/openmathinstruct2/dataset.md index b830310b66..e6aa04fba6 100644 --- a/docs/releases/openmathinstruct2/dataset.md +++ b/docs/releases/openmathinstruct2/dataset.md @@ -29,8 +29,8 @@ ns generate \ --server_nodes=2 \ --num_random_seeds=512 \ --output_dir=/workspace/solution-augmentation/math \ - --eval_args="++eval_type=math" \ - --input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \ + --input_file=/nemo_run/code/nemo_skills/dataset/hendrycks_math/train.jsonl \ + ++eval_type=math \ ++prompt_config=generic/math-base \ ++examples_type=math_text_detailed \ 
++inference.endpoint_type=text \ @@ -49,8 +49,8 @@ ns generate \ --server_nodes=2 \ --num_random_seeds=64 \ --output_dir=/workspace/solution-augmentation/gsm8k \ - --eval_args="++eval_type=math" \ --input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/train.jsonl \ + ++eval_type=math \ ++prompt_config=generic/math-base \ ++examples_type=gsm8k_text_detailed \ ++inference.endpoint_type=text \ @@ -72,7 +72,7 @@ ns generate \ --server_nodes=2 \ --num_random_seeds=80 \ --output_dir=/workspace/problem-augmentation/math \ - --input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \ + --input_file=/nemo_run/code/nemo_skills/dataset/hendrycks_math/train.jsonl \ ++prompt_config=generic/problem-augmentation \ ++examples_type=math_problem_augmentation \ ++generation_key=problem \ @@ -221,7 +221,7 @@ Retrieve top-5 similar items from the test sets from nemo_skills.pipeline.cli import wrap_arguments, run_cmd -test_sets = ['gsm8k', 'math', 'amc23', 'aime24'] +test_sets = ['gsm8k', 'hendrycks_math', 'amc23', 'aime24'] retrieve_from = ",".join(f"/nemo_run/code/nemo_skills/dataset/{test_set}/test.jsonl" for test_set in test_sets) cmd = ( diff --git a/docs/releases/openmathinstruct2/evaluation.md b/docs/releases/openmathinstruct2/evaluation.md index 7b516fd539..17687f6955 100644 --- a/docs/releases/openmathinstruct2/evaluation.md +++ b/docs/releases/openmathinstruct2/evaluation.md @@ -10,7 +10,7 @@ if running on slurm or using different paths. 
## Prepare evaluation data ```bash -ns prepare_data gsm8k math amc23 aime24 omni-math +ns prepare_data gsm8k hendrycks_math amc23 aime24 omni-math ``` ## Run greedy decoding @@ -21,7 +21,7 @@ ns eval \ --model=nvidia/OpenMath2-Llama3.1-8B \ --server_type=trtllm \ --output_dir=/workspace/openmath2-llama3.1-8b-eval \ - --benchmarks=aime24,amc23,math,gsm8k,omni-math \ + --benchmarks=aime24,amc23,hendrycks_math,gsm8k,omni-math \ --server_gpus=1 \ --num_jobs=1 \ ++inference.tokens_to_generate=4096 @@ -36,7 +36,7 @@ accurate numbers than symbolic comparison. You need to define `OPENAI_API_KEY` f the command below to work. ```bash -for dataset in aime24 amc23 math gsm8k omni-math; do +for dataset in aime24 amc23 hendrycks_math gsm8k omni-math; do ns generate \ --generation_type=math_judge \ --cluster=local \ @@ -72,7 +72,7 @@ evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct pass@1 | 4428 | 18.97 | 22.22 | 18.11 | 23.08 | 2.55 --------------------------------------------------- math ------------------------------------------------- +--------------------------------------------- hendrycks_math -------------------------------------------- evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer pass@1 | 5000 | 67.70 | 68.10 | 67.50 | 68.30 | 1.36 @@ -92,7 +92,7 @@ ns eval \ --model=nvidia/OpenMath2-Llama3.1-8B \ --server_type=trtllm \ --output_dir=/workspace/openmath2-llama3.1-8b-eval \ - --benchmarks=aime24:256,amc23:256,math:256,gsm8k:256,omni-math:256 \ + --benchmarks=aime24:256,amc23:256,hendrycks_math:256,gsm8k:256,omni-math:256 \ --server_gpus=1 \ --num_jobs=1 \ ++inference.tokens_to_generate=4096 @@ -100,14 +100,14 @@ ns eval \ This will take a very long time unless you run on slurm cluster. After the generation is done, you will be able to see symbolic scores right away. You can evaluate with the judge by first creating new files with majority -answers. E.g. 
for "math" benchmark run +answers. E.g. for "hendrycks_math" benchmark run ```bash python -m nemo_skills.evaluation.aggregate_answers \ - ++input_dir="./openmath2-llama3.1-8b-eval/eval-results/math" \ + ++input_dir="./openmath2-llama3.1-8b-eval/eval-results/hendrycks_math" \ ++input_files="output-rs*.jsonl" \ ++mode=extract \ - ++output_dir="./openmath2-llama3.1-8b-eval/eval-results-majority/math" + ++output_dir="./openmath2-llama3.1-8b-eval/eval-results-majority/hendrycks_math" ``` -This will output "./openmath2-llama3.1-8b-eval/eval-results-majority/math/output-agg.jsonl" file with majority answer. We can run the llm-judge pipeline on it. +This will output "./openmath2-llama3.1-8b-eval/eval-results-majority/hendrycks_math/output-agg.jsonl" file with majority answer. We can run the llm-judge pipeline on it. @@ -117,7 +117,7 @@ Repeat the above steps for all benchmarks. Now we are ready to run the judge pip after it is finished. You need to define `OPENAI_API_KEY` for the command below to work. ```bash -for dataset in aime24 amc23 math gsm8k omni-math; do +for dataset in aime24 amc23 hendrycks_math gsm8k omni-math; do ns generate \ --generation_type=math_judge \ --cluster=local \ diff --git a/docs/releases/openmathreasoning/evaluation.md b/docs/releases/openmathreasoning/evaluation.md index 9e6ffcbe25..6353fb9d14 100644 --- a/docs/releases/openmathreasoning/evaluation.md +++ b/docs/releases/openmathreasoning/evaluation.md @@ -36,6 +36,7 @@ ns eval \ --server_gpus=1 \ --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \ --benchmarks=comp-math-24-25:64 \ + ++parse_reasoning=True \ ++prompt_config=generic/math \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 @@ -53,6 +54,7 @@ ns eval \ --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \ --benchmarks=hle:64 \ --split=math \ + ++parse_reasoning=True \ ++prompt_config=generic/math \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ @@ -102,6 +104,7 @@ ns eval \ --server_gpus=1 \ --num_jobs=1 \ --with_sandbox \ + ++parse_reasoning=True \ ++code_tags=openmath \ ++prompt_config=openmath/tir \ ++inference.endpoint_type=text \ @@ -125,6 +128,7 
@@ ns eval \ --server_gpus=1 \ --num_jobs=1 \ --with_sandbox \ + ++parse_reasoning=True \ ++code_tags=openmath \ ++prompt_config=generic/math \ ++inference.endpoint_type=text \ diff --git a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md index c140a2e9e7..536b97a251 100644 --- a/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md +++ b/docs/tutorials/posts/llama-nemotron-super-v1.5-evals.md @@ -103,10 +103,10 @@ ns eval \ --output_dir=/workspace/llama_nemotron_49b_1_5/ \ --benchmarks=scicode:16,math-500:16,aime24:16,aime25:16 \ --server_gpus=2 \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ``` For GPQA and MMLU-Pro, we additionally specify the exact prompt on which we evaluate the benchmark: @@ -118,11 +118,11 @@ ns eval \ --output_dir=/workspace/llama_nemotron_49b_1_5/ \ --benchmarks=mmlu-pro:16 \ --server_gpus=2 \ + ++parse_reasoning=True \ ++prompt_config=eval/aai/mcq-10choices-boxed \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ns eval \ --cluster=local \ @@ -131,11 +131,11 @@ ns eval \ --output_dir=/workspace/llama_nemotron_49b_1_5/ \ --benchmarks=gpqa:16 \ --server_gpus=2 \ + ++parse_reasoning=True \ ++prompt_config=eval/aai/mcq-4choices-boxed \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ``` For LiveCodeBench, we additionally specify the exact split on which we evaluate the benchmark. In the following command, we evaluate the model on the 166 problems from the 1 October 2024 to 1 March 2025 subset from release_v5. 
To evaluate on the Artificial Analysis Index (AAI) split, set split to `test_v5_2407_2412`: @@ -149,10 +149,10 @@ ns eval \ --benchmarks=livecodebench:16 \ --split=test_v5_2410_2502 \ --server_gpus=2 \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ``` #### Command for HLE Eval (Reasoning on) @@ -173,10 +173,10 @@ ns eval \ --server_gpus=2 \ --judge_model="o3-mini-20250131" \ --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ``` !!! note @@ -197,10 +197,10 @@ ns eval \ --server_gpus=2 \ --server_type=vllm \ --output_dir=/workspace/llama_nemotron_49b_1_5_tool_calling/ \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=65536 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ - ++system_message='' \ ++use_client_parsing=False \ --server_args="--tool-parser-plugin \"/workspace/Llama-3_3-Nemotron-Super-49B-v1_5/llama_nemotron_toolcall_parser_no_streaming.py\" \ --tool-call-parser \"llama_nemotron_json\" \ @@ -211,7 +211,8 @@ ns eval \ For RULER we need to use the same `data_dir` in the evaluation command as we used in the data preparation. We also need to use the data preparation `setup` as part of the benchmark name. Finally it's important not to specify -`++inference.tokens_to_generate` as RULER has a fixed value of this parameter for each task. +`++inference.tokens_to_generate` or `++parse_reasoning=True`, as +RULER has a predefined setup for those parameters.
```bash hl_lines="6-7" ns eval \ @@ -223,8 +224,7 @@ ns eval \ --data_dir=/workspace/ns-data \ --server_gpus=2 \ ++inference.temperature=0.6 \ - ++inference.top_p=0.95 \ - ++system_message='' + ++inference.top_p=0.95 ``` ### Reasoning-on Results diff --git a/docs/tutorials/posts/nemotron-nano-v2-evals.md b/docs/tutorials/posts/nemotron-nano-v2-evals.md index 2d9f6374ae..45fa9bb1c7 100644 --- a/docs/tutorials/posts/nemotron-nano-v2-evals.md +++ b/docs/tutorials/posts/nemotron-nano-v2-evals.md @@ -121,6 +121,7 @@ ns eval \ --server_type=vllm \ --server_gpus=1 \ --server_args="--mamba_ssm_cache_dtype float32 " \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -138,6 +139,7 @@ ns eval \ --server_type=vllm \ --server_gpus=1 \ --server_args="--mamba_ssm_cache_dtype float32 " \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -156,6 +158,7 @@ ns eval \ --server_type=vllm \ --server_gpus=1 \ --server_args="--mamba_ssm_cache_dtype float32 " \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -175,6 +178,7 @@ ns eval \ --server_type=vllm \ --server_gpus=1 \ --server_args="--mamba_ssm_cache_dtype float32 " \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -200,6 +204,7 @@ ns eval \ --server_args="--mamba_ssm_cache_dtype float32 " \ --judge_model="o3-mini-20250131" \ --extra_judge_args="++inference.tokens_to_generate=4096 ++max_concurrent_requests=8" \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ @@ -228,6 +233,7 @@ ns eval \ --tool-parser-plugin \"/workspace/NVIDIA-Nemotron-Nano-9B-v2/nemotron_toolcall_parser_no_streaming.py\" \ --tool-call-parser \"nemotron_json\" \ --enable-auto-tool-choice" \ + 
++parse_reasoning=True \ ++use_client_parsing=False \ ++inference.tokens_to_generate=32768 \ ++inference.temperature=0.6 \ diff --git a/docs/tutorials/posts/omr-simple-recipe.md b/docs/tutorials/posts/omr-simple-recipe.md index e755e304c4..aaaba980fa 100644 --- a/docs/tutorials/posts/omr-simple-recipe.md +++ b/docs/tutorials/posts/omr-simple-recipe.md @@ -234,6 +234,7 @@ ns eval \ --benchmarks=aime24:8,aime25:8 \ --output_dir=/workspace/evals/after-training \ --num_jobs=1 \ + ++parse_reasoning=True \ ++inference.tokens_to_generate=16384 # summarize results, after the evaluation job is done ns summarize_results --cluster=local /workspace/evals/after-training --wandb_name=after-training-evals diff --git a/nemo_skills/code_execution/sandbox.py b/nemo_skills/code_execution/sandbox.py index f216a7b1c6..6dcfa15606 100644 --- a/nemo_skills/code_execution/sandbox.py +++ b/nemo_skills/code_execution/sandbox.py @@ -14,34 +14,24 @@ import abc import asyncio -import glob import json import logging import os import traceback import uuid from collections import defaultdict -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Tuple import httpx -import tqdm from nemo_skills.code_execution.proof_utils import ( - ProofBuildConfig, determine_proof_status, - prepare_predicted_proof_from_line_dict, ) from nemo_skills.utils import get_logger_name, python_doc_to_cmd_help LOG = logging.getLogger(get_logger_name(__file__)) -def unroll_files(input_files): - for manifest_pattern in input_files: - for manifest in sorted(glob.glob(manifest_pattern, recursive=True)): - yield manifest - - class Sandbox(abc.ABC): """Code execution sandbox. 
@@ -263,76 +253,6 @@ async def is_proof_correct(self, pred_output, timeout=30.0): return "timeout" return determine_proof_status(output) - async def batch_evaluate_results( - self, - input_files: List[str], - num_parallel_requests=10, - timeout=30.0, - answer_format="lean4-proof", - use_predicted_proof_key: bool = False, - final_answer_key: str = "**FINAL ANSWER**", - restate_formal_statement: bool = True, - strip_theorem_from_proof: bool = True, - extract_code_mode: str = "last", - ): - """Evaluate results and write back to original files.""" - - semaphore = asyncio.Semaphore(num_parallel_requests) - - async def process_line(line_data): - """Process a single line and return updated line data.""" - if not line_data or not line_data.strip(): - return line_data - - line_dict = json.loads(line_data) - if not line_dict: - return line_data - - # Prepare predicted_proof using shared utility - config = ProofBuildConfig( - final_answer_key=final_answer_key, - extract_code_mode=extract_code_mode, - restate_formal_statement=restate_formal_statement, - strip_theorem_from_proof=strip_theorem_from_proof, - ) - - line_dict["predicted_proof"] = prepare_predicted_proof_from_line_dict( - line_dict=line_dict, - config=config, - answer_format=answer_format, - use_predicted_proof_key=use_predicted_proof_key, - ) - - # Evaluate proof with concurrency control - async with semaphore: - proof_status = await self.is_proof_correct(line_dict["predicted_proof"], timeout=timeout) - line_dict["proof_status"] = proof_status - - return json.dumps(line_dict) - - # Process each file - for input_file in unroll_files(input_files): - # Read all lines - with open(input_file, "rt", encoding="utf-8") as f: - lines = f.readlines() - - # Process lines concurrently with progress bar - print(f"Processing {input_file}...") - processed_lines = [] - tasks = [asyncio.create_task(process_line(line.rstrip("\n"))) for line in lines] - processed_lines = [] - for coro in tqdm.tqdm(tasks, total=len(tasks)): - 
processed_lines.append(await coro) - - # Write to temp file then replace original - temp_file = input_file + "-tmp" - with open(temp_file, "wt", encoding="utf-8") as f: - for line in processed_lines: - f.write(line + "\n") - - # Replace original with temp file - os.replace(temp_file, input_file) - class LocalSandbox(Sandbox): """Locally hosted sandbox.""" diff --git a/nemo_skills/dataset/aai/__init__.py b/nemo_skills/dataset/aai/__init__.py index 8e35184ecc..bbb6347a57 100644 --- a/nemo_skills/dataset/aai/__init__.py +++ b/nemo_skills/dataset/aai/__init__.py @@ -27,7 +27,7 @@ # can add "NUM_CHUNKS": N to parallelize }, "hle": { - "GENERATION_ARGS": "++remove_thinking=True ++inference.temperature=0.0", + "GENERATION_ARGS": "++inference.temperature=0.0", "JUDGE_ARGS": "++prompt_config=judge/hle ++generation_key=judgement", }, # Science benchmarks diff --git a/nemo_skills/dataset/aalcr/__init__.py b/nemo_skills/dataset/aalcr/__init__.py index 95c7206dd7..4a4a645e3f 100644 --- a/nemo_skills/dataset/aalcr/__init__.py +++ b/nemo_skills/dataset/aalcr/__init__.py @@ -16,8 +16,8 @@ PROMPT_CONFIG = "generic/default" DATASET_GROUP = "long-context" METRICS_TYPE = "aalcr" -EVAL_ARGS = "++eval_type=no-op" # using judgement directly in metrics, no need for special evaluation -GENERATION_ARGS = "" +# using judgement directly in metrics, no need for special evaluation +GENERATION_ARGS = "++prompt_config=generic/default" JUDGE_PIPELINE_ARGS = { "model": "gpt-4.1", diff --git a/nemo_skills/dataset/aalcr/prepare.py b/nemo_skills/dataset/aalcr/prepare.py index 35479ac535..856d734639 100644 --- a/nemo_skills/dataset/aalcr/prepare.py +++ b/nemo_skills/dataset/aalcr/prepare.py @@ -255,4 +255,4 @@ def prepare_aalcr_data(max_context_window, setup, tokenizer_name): LOG.info(f"Preparing AA-LCR dataset with additional arguments: {args}") prepare_aalcr_data(args.max_context_window, args.setup, args.tokenizer_name) - LOG.info(f"AA-LCR dataset preparation with setup {args.setup} completed. 
Use --split=${args.setup} to evaluate!") + LOG.info(f"AA-LCR dataset preparation with setup {args.setup} completed. Use --split={args.setup} to evaluate!") diff --git a/nemo_skills/dataset/aime24/__init__.py b/nemo_skills/dataset/aime24/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/aime24/__init__.py +++ b/nemo_skills/dataset/aime24/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/aime25/__init__.py b/nemo_skills/dataset/aime25/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/aime25/__init__.py +++ b/nemo_skills/dataset/aime25/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/algebra222/__init__.py b/nemo_skills/dataset/algebra222/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/algebra222/__init__.py +++ b/nemo_skills/dataset/algebra222/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/amc23/__init__.py b/nemo_skills/dataset/amc23/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/amc23/__init__.py +++ b/nemo_skills/dataset/amc23/__init__.py @@ -15,5 +15,4 @@ # 
settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/answer-judge/__init__.py b/nemo_skills/dataset/answer-judge/__init__.py index cfb947e615..6f5ce68817 100644 --- a/nemo_skills/dataset/answer-judge/__init__.py +++ b/nemo_skills/dataset/answer-judge/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "math" METRICS_TYPE = "answer-judgement" # using judgement directly in metrics, no need for special evaluation -EVAL_ARGS = "++eval_type=no-op ++generation_key=judgement" GENERATION_ARGS = "++prompt_config=judge/math ++generation_key=judgement" diff --git a/nemo_skills/dataset/arena-hard/__init__.py b/nemo_skills/dataset/arena-hard/__init__.py index 00ea69f6a2..e21faaae89 100644 --- a/nemo_skills/dataset/arena-hard/__init__.py +++ b/nemo_skills/dataset/arena-hard/__init__.py @@ -16,7 +16,7 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "chat" METRICS_TYPE = "arena" -EVAL_ARGS = "++eval_type=no-op" # using judgement directly in metrics, no need for special evaluation +# using judgement directly in metrics, no need for special evaluation GENERATION_ARGS = "++prompt_config=generic/default" JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/dataset/asdiv/__init__.py b/nemo_skills/dataset/asdiv/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/asdiv/__init__.py +++ b/nemo_skills/dataset/asdiv/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git 
a/nemo_skills/dataset/beyond-aime/__init__.py b/nemo_skills/dataset/beyond-aime/__init__.py index 6116e76ece..af7fdd7412 100644 --- a/nemo_skills/dataset/beyond-aime/__init__.py +++ b/nemo_skills/dataset/beyond-aime/__init__.py @@ -17,5 +17,4 @@ DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/bfcl_v3/prepare.py b/nemo_skills/dataset/bfcl_v3/prepare.py index 3ee9cc4e71..990179e6cf 100644 --- a/nemo_skills/dataset/bfcl_v3/prepare.py +++ b/nemo_skills/dataset/bfcl_v3/prepare.py @@ -45,8 +45,7 @@ DEFAULT_SETTINGS = """ DATASET_GROUP = "tool" METRICS_TYPE = "bfcl" -EVAL_ARGS = "++eval_type=bfcl" -GENERATION_ARGS = "" +GENERATION_ARGS = "++eval_type=bfcl" GENERATION_MODULE = "nemo_skills.inference.eval.bfcl" """ diff --git a/nemo_skills/dataset/bigcodebench/__init__.py b/nemo_skills/dataset/bigcodebench/__init__.py index c5b06cd0ac..394ad2703a 100644 --- a/nemo_skills/dataset/bigcodebench/__init__.py +++ b/nemo_skills/dataset/bigcodebench/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "code" METRICS_TYPE = "bigcodebench" EVAL_SPLIT = "full" -EVAL_ARGS = "++eval_type=bigcodebench" -GENERATION_ARGS = "++prompt_config=eval/bigcodebench/codegen" +GENERATION_ARGS = "++prompt_config=eval/bigcodebench/codegen ++eval_type=bigcodebench" diff --git a/nemo_skills/dataset/bigcodebench/prepare.py b/nemo_skills/dataset/bigcodebench/prepare.py index e4f01a1f3b..f9229846cd 100644 --- a/nemo_skills/dataset/bigcodebench/prepare.py +++ b/nemo_skills/dataset/bigcodebench/prepare.py @@ -80,7 +80,7 @@ def wrap_in_code_tag(text): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--output_dir", type=str, default=str(Path(__file__).parent)) - parser.add_argument("--split", type=str, default="hard", choices=["full", "hard"]) + parser.add_argument("--split", type=str, default="full", 
choices=["full", "hard"]) args = parser.parse_args() diff --git a/nemo_skills/dataset/brumo25/__init__.py b/nemo_skills/dataset/brumo25/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/brumo25/__init__.py +++ b/nemo_skills/dataset/brumo25/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/college_math/__init__.py b/nemo_skills/dataset/college_math/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/college_math/__init__.py +++ b/nemo_skills/dataset/college_math/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/comp-math-24-25/__init__.py b/nemo_skills/dataset/comp-math-24-25/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/comp-math-24-25/__init__.py +++ b/nemo_skills/dataset/comp-math-24-25/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/flores200/__init__.py b/nemo_skills/dataset/flores200/__init__.py index 86a7f76717..8ed1e7ad36 100644 --- a/nemo_skills/dataset/flores200/__init__.py +++ b/nemo_skills/dataset/flores200/__init__.py @@ -15,8 +15,7 @@ # settings that define how evaluation should 
be done by default (all can be changed from cmdline) -PROMPT_CONFIG = "multilingual/segment-translation" DATASET_GROUP = "chat" METRICS_TYPE = "translation" -EVAL_ARGS = "++eval_type=no-op" -GENERATION_ARGS = "" +GENERATION_ARGS = "++prompt_config=multilingual/segment-translation" +EVAL_SPLIT = "devtest" diff --git a/nemo_skills/dataset/flores200/prepare.py b/nemo_skills/dataset/flores200/prepare.py index 7a427e0f1f..6055d8d45c 100644 --- a/nemo_skills/dataset/flores200/prepare.py +++ b/nemo_skills/dataset/flores200/prepare.py @@ -56,7 +56,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--split", default="dev", choices=("dev", "devtest"), help="Dataset split to process.") + parser.add_argument("--split", default="devtest", choices=("dev", "devtest"), help="Dataset split to process.") parser.add_argument( "--source_languages", default=["en", "de", "es", "fr", "it", "ja"], diff --git a/nemo_skills/dataset/gaokao2023en/__init__.py b/nemo_skills/dataset/gaokao2023en/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/gaokao2023en/__init__.py +++ b/nemo_skills/dataset/gaokao2023en/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/gpqa/__init__.py b/nemo_skills/dataset/gpqa/__init__.py index 4e9b6084c8..21c39cfac8 100644 --- a/nemo_skills/dataset/gpqa/__init__.py +++ b/nemo_skills/dataset/gpqa/__init__.py @@ -17,6 +17,5 @@ DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" EVAL_SPLIT = "diamond" -GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices" +GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices ++eval_type=multichoice" diff --git 
a/nemo_skills/dataset/gsm-plus/__init__.py b/nemo_skills/dataset/gsm-plus/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/gsm-plus/__init__.py +++ b/nemo_skills/dataset/gsm-plus/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/gsm8k/__init__.py b/nemo_skills/dataset/gsm8k/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/gsm8k/__init__.py +++ b/nemo_skills/dataset/gsm8k/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/math/__init__.py b/nemo_skills/dataset/hendrycks_math/__init__.py similarity index 90% rename from nemo_skills/dataset/math/__init__.py rename to nemo_skills/dataset/hendrycks_math/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/math/__init__.py +++ b/nemo_skills/dataset/hendrycks_math/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/math/fix_ref_solns.py b/nemo_skills/dataset/hendrycks_math/fix_ref_solns.py similarity index 100% rename from nemo_skills/dataset/math/fix_ref_solns.py rename to nemo_skills/dataset/hendrycks_math/fix_ref_solns.py diff --git 
a/nemo_skills/dataset/math/prepare.py b/nemo_skills/dataset/hendrycks_math/prepare.py similarity index 100% rename from nemo_skills/dataset/math/prepare.py rename to nemo_skills/dataset/hendrycks_math/prepare.py diff --git a/nemo_skills/dataset/hle/__init__.py b/nemo_skills/dataset/hle/__init__.py index 1d841b7dc1..51db80829d 100644 --- a/nemo_skills/dataset/hle/__init__.py +++ b/nemo_skills/dataset/hle/__init__.py @@ -15,8 +15,7 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "hle" # This uses the MathMetrics class, but with compute_no_answer=False -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/hle" +GENERATION_ARGS = "++prompt_config=generic/hle ++eval_type=math" EVAL_SPLIT = "text" # Some answers are not possible to compare symbolically, so have to use a judge model diff --git a/nemo_skills/dataset/hmmt_feb25/__init__.py b/nemo_skills/dataset/hmmt_feb25/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/hmmt_feb25/__init__.py +++ b/nemo_skills/dataset/hmmt_feb25/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/human-eval-infilling/__init__.py b/nemo_skills/dataset/human-eval-infilling/__init__.py index 1a182c5a5d..ec4a847c21 100644 --- a/nemo_skills/dataset/human-eval-infilling/__init__.py +++ b/nemo_skills/dataset/human-eval-infilling/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "code" METRICS_TYPE = "human_eval_infilling" EVAL_SPLIT = "random_span" -EVAL_ARGS = "++eval_type=human_eval_infilling" -GENERATION_ARGS = "++prompt_config=generic/fim" +GENERATION_ARGS = "++prompt_config=generic/fim 
++eval_type=human_eval_infilling" diff --git a/nemo_skills/dataset/human-eval/__init__.py b/nemo_skills/dataset/human-eval/__init__.py index 9cb3be8c2d..fa880da4c6 100644 --- a/nemo_skills/dataset/human-eval/__init__.py +++ b/nemo_skills/dataset/human-eval/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "code" METRICS_TYPE = "evalplus" -EVAL_ARGS = "++eval_type=evalplus ++eval_config.dataset=humaneval" -GENERATION_ARGS = "++prompt_config=generic/codegen" +GENERATION_ARGS = "++prompt_config=generic/codegen ++eval_type=evalplus ++eval_config.evalplus.dataset=humaneval" diff --git a/nemo_skills/dataset/ifbench/__init__.py b/nemo_skills/dataset/ifbench/__init__.py index 9839ac800f..ea8a854ad7 100644 --- a/nemo_skills/dataset/ifbench/__init__.py +++ b/nemo_skills/dataset/ifbench/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "chat" METRICS_TYPE = "if" -EVAL_ARGS = "++eval_type=ifbench ++generation_key=response" -GENERATION_ARGS = "++generation_key=response ++prompt_config=generic/default" +GENERATION_ARGS = "++generation_key=response ++prompt_config=generic/default ++eval_type=ifbench" diff --git a/nemo_skills/dataset/ifeval/__init__.py b/nemo_skills/dataset/ifeval/__init__.py index d0c65339a8..40e7f12baf 100644 --- a/nemo_skills/dataset/ifeval/__init__.py +++ b/nemo_skills/dataset/ifeval/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "chat" METRICS_TYPE = "if" -EVAL_ARGS = "++eval_type=if ++generation_key=response" -GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response" +GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response ++eval_type=if" diff --git a/nemo_skills/dataset/ioi24/__init__.py b/nemo_skills/dataset/ioi24/__init__.py index 
f3121341a7..bbe0910186 100644 --- a/nemo_skills/dataset/ioi24/__init__.py +++ b/nemo_skills/dataset/ioi24/__init__.py @@ -13,10 +13,9 @@ # limitations under the License. # settings that define how evaluation should be done by default (all can be changed from cmdline) -GENERATION_ARGS = "++prompt_config=generic/default" +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi" DATASET_GROUP = "code" METRICS_TYPE = "ioi" -EVAL_ARGS = "++eval_type=ioi" # environment variables required by this benchmark SANDBOX_ENV_VARS = [ diff --git a/nemo_skills/dataset/ioi25/__init__.py b/nemo_skills/dataset/ioi25/__init__.py index b4311554a9..3032b16653 100644 --- a/nemo_skills/dataset/ioi25/__init__.py +++ b/nemo_skills/dataset/ioi25/__init__.py @@ -17,10 +17,9 @@ """ # settings that define how evaluation should be done by default (all can be changed from cmdline) -GENERATION_ARGS = "++prompt_config=generic/default" +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi" DATASET_GROUP = "code" METRICS_TYPE = "ioi" -EVAL_ARGS = "++eval_type=ioi" # environment variables required by this benchmark SANDBOX_ENV_VARS = [ diff --git a/nemo_skills/dataset/livebench-coding/__init__.py b/nemo_skills/dataset/livebench-coding/__init__.py index fc8fd66822..523301c188 100644 --- a/nemo_skills/dataset/livebench-coding/__init__.py +++ b/nemo_skills/dataset/livebench-coding/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "code" METRICS_TYPE = "livecodebench" EVAL_SPLIT = "test" -EVAL_ARGS = "++eval_type=livebench_coding" -GENERATION_ARGS = "++prompt_config=generic/default" +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=livebench_coding" diff --git a/nemo_skills/dataset/livecodebench-cpp/__init__.py b/nemo_skills/dataset/livecodebench-cpp/__init__.py index bc4565da2d..76d7c478de 100644 --- a/nemo_skills/dataset/livecodebench-cpp/__init__.py +++ b/nemo_skills/dataset/livecodebench-cpp/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "code" METRICS_TYPE = 
"livecodebench" EVAL_SPLIT = "v6_2408_2505" -EVAL_ARGS = "++eval_type=livecodebench ++eval_config.language=cpp" -GENERATION_ARGS = "++prompt_config=eval/livecodebench/cpp_codegen" +GENERATION_ARGS = "++prompt_config=eval/livecodebench/cpp_codegen ++eval_type=livecodebench ++eval_config.language=cpp" diff --git a/nemo_skills/dataset/livecodebench-pro/__init__.py b/nemo_skills/dataset/livecodebench-pro/__init__.py index 2252582433..a071794767 100644 --- a/nemo_skills/dataset/livecodebench-pro/__init__.py +++ b/nemo_skills/dataset/livecodebench-pro/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "code" METRICS_TYPE = "code" -EVAL_ARGS = "++eval_type=livecodebench_pro" -GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen" +GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen ++eval_type=livecodebench_pro" diff --git a/nemo_skills/dataset/livecodebench/__init__.py b/nemo_skills/dataset/livecodebench/__init__.py index f2d32be016..6945c0c5e0 100644 --- a/nemo_skills/dataset/livecodebench/__init__.py +++ b/nemo_skills/dataset/livecodebench/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "code" METRICS_TYPE = "livecodebench" EVAL_SPLIT = "test_v6_2408_2505" -EVAL_ARGS = "++eval_type=livecodebench" -GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen" +GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen ++eval_type=livecodebench" diff --git a/nemo_skills/dataset/math-500/__init__.py b/nemo_skills/dataset/math-500/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/math-500/__init__.py +++ b/nemo_skills/dataset/math-500/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = 
"++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/math-odyssey/__init__.py b/nemo_skills/dataset/math-odyssey/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/math-odyssey/__init__.py +++ b/nemo_skills/dataset/math-odyssey/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/mawps/__init__.py b/nemo_skills/dataset/mawps/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/mawps/__init__.py +++ b/nemo_skills/dataset/mawps/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/mbpp/__init__.py b/nemo_skills/dataset/mbpp/__init__.py index f21e877ba1..78caff46cd 100644 --- a/nemo_skills/dataset/mbpp/__init__.py +++ b/nemo_skills/dataset/mbpp/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "code" METRICS_TYPE = "evalplus" -EVAL_ARGS = "++eval_type=evalplus ++eval_config.dataset=mbpp" -GENERATION_ARGS = "++prompt_config=generic/codegen" +GENERATION_ARGS = "++prompt_config=generic/codegen ++eval_type=evalplus ++eval_config.evalplus.dataset=mbpp" diff --git a/nemo_skills/dataset/minerva_math/__init__.py b/nemo_skills/dataset/minerva_math/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/minerva_math/__init__.py +++ b/nemo_skills/dataset/minerva_math/__init__.py @@ -15,5 +15,4 @@ # settings 
that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/minif2f/__init__.py b/nemo_skills/dataset/minif2f/__init__.py index e376ee01a4..09c363b881 100644 --- a/nemo_skills/dataset/minif2f/__init__.py +++ b/nemo_skills/dataset/minif2f/__init__.py @@ -16,6 +16,5 @@ # Default evaluation and generation settings for the minif2f dataset DATASET_GROUP = "lean4" METRICS_TYPE = "lean4-proof" -EVAL_ARGS = "++eval_type=lean4-proof" -GENERATION_ARGS = "++prompt_config=lean4/formal-proof-deepseek-prover-v2" +GENERATION_ARGS = "++prompt_config=lean4/formal-proof-deepseek-prover-v2 ++eval_type=lean4-proof" REQUIRES_SANDBOX = True diff --git a/nemo_skills/dataset/mmlu-pro/__init__.py b/nemo_skills/dataset/mmlu-pro/__init__.py index ba6ce4e3b9..233f830b5a 100644 --- a/nemo_skills/dataset/mmlu-pro/__init__.py +++ b/nemo_skills/dataset/mmlu-pro/__init__.py @@ -17,5 +17,4 @@ DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" -GENERATION_ARGS = "++prompt_config=eval/aai/mcq-10choices" +GENERATION_ARGS = "++prompt_config=eval/aai/mcq-10choices ++eval_type=multichoice" diff --git a/nemo_skills/dataset/mmlu-prox/__init__.py b/nemo_skills/dataset/mmlu-prox/__init__.py index e0f954cf0d..996d8be6f9 100644 --- a/nemo_skills/dataset/mmlu-prox/__init__.py +++ b/nemo_skills/dataset/mmlu-prox/__init__.py @@ -15,8 +15,6 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) -PROMPT_CONFIG = "generic/default" DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" -GENERATION_ARGS = "" +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=multichoice" diff --git 
a/nemo_skills/dataset/mmlu-redux/__init__.py b/nemo_skills/dataset/mmlu-redux/__init__.py index 20ec7704ae..0631833628 100644 --- a/nemo_skills/dataset/mmlu-redux/__init__.py +++ b/nemo_skills/dataset/mmlu-redux/__init__.py @@ -16,6 +16,4 @@ DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" - -GENERATION_ARGS = "++prompt_config=generic/general-boxed" +GENERATION_ARGS = "++prompt_config=generic/general-boxed ++eval_type=multichoice" diff --git a/nemo_skills/dataset/mmlu/__init__.py b/nemo_skills/dataset/mmlu/__init__.py index adcd545590..dad531a56d 100644 --- a/nemo_skills/dataset/mmlu/__init__.py +++ b/nemo_skills/dataset/mmlu/__init__.py @@ -16,5 +16,4 @@ DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" -GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices-boxed" +GENERATION_ARGS = "++prompt_config=eval/aai/mcq-4choices-boxed ++eval_type=multichoice" diff --git a/nemo_skills/dataset/mobench/__init__.py b/nemo_skills/dataset/mobench/__init__.py index d99b09b2cd..6761a0ce5f 100644 --- a/nemo_skills/dataset/mobench/__init__.py +++ b/nemo_skills/dataset/mobench/__init__.py @@ -15,6 +15,5 @@ # Default evaluation and generation settings for Lean4 proof benchmarks DATASET_GROUP = "lean4" METRICS_TYPE = "lean4-proof" -EVAL_ARGS = "++eval_type=lean4-proof" -GENERATION_ARGS = "++prompt_config=lean4/formal-proof-deepseek-prover-v2" +GENERATION_ARGS = "++prompt_config=lean4/formal-proof-deepseek-prover-v2 ++eval_type=lean4-proof" REQUIRES_SANDBOX = True diff --git a/nemo_skills/dataset/mrcr/__init__.py b/nemo_skills/dataset/mrcr/__init__.py index 4581020086..c0416bd254 100644 --- a/nemo_skills/dataset/mrcr/__init__.py +++ b/nemo_skills/dataset/mrcr/__init__.py @@ -14,5 +14,4 @@ EVAL_SPLIT = "all" DATASET_GROUP = "long-context" METRICS_TYPE = "mrcr" -EVAL_ARGS = "++eval_type=mrcr" -GENERATION_ARGS = "++prompt_format=openai" +GENERATION_ARGS = "++prompt_format=openai 
++eval_type=mrcr" diff --git a/nemo_skills/dataset/mrcr/prepare.py b/nemo_skills/dataset/mrcr/prepare.py index f5a8a08ad0..52c5cd82cd 100644 --- a/nemo_skills/dataset/mrcr/prepare.py +++ b/nemo_skills/dataset/mrcr/prepare.py @@ -98,4 +98,4 @@ def get_mrcr_data(needles_subset, setup, max_context_window): print(f"Preparing MRCR dataset with additional arguments: {args}") get_mrcr_data(args.needles_subset, args.setup, args.max_context_window) - print(f"MRCR dataset preparation with setup {args.setup} completed. Use --split=${args.setup} to evaluate!") + print(f"MRCR dataset preparation with setup {args.setup} completed. Use --split={args.setup} to evaluate!") diff --git a/nemo_skills/dataset/ojbench/__init__.py b/nemo_skills/dataset/ojbench/__init__.py deleted file mode 100644 index e3f88f6a96..0000000000 --- a/nemo_skills/dataset/ojbench/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# settings that define how evaluation should be done by default (all can be changed from cmdline) -DATASET_GROUP = "code" -METRICS_TYPE = "ojbench" -EVAL_SPLIT = "test_python" -EVAL_ARGS = "++eval_type=ojbench" -REQUIRES_SANDBOX = True -KEEP_MOUNTS_FOR_SANDBOX = True -GENERATION_ARGS = "++prompt_config=generic/default" diff --git a/nemo_skills/dataset/ojbench/prepare.py b/nemo_skills/dataset/ojbench/prepare.py deleted file mode 100644 index aa36e16767..0000000000 --- a/nemo_skills/dataset/ojbench/prepare.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import shutil -import subprocess -import sys -from pathlib import Path - -REPO_URL = "https://huggingface.co/datasets/He-Ren/OJBench_testdata" -HF_TOKEN = os.environ.get("HF_TOKEN") -if not HF_TOKEN: - print("❌ Error: Hugging Face token not found.", file=sys.stderr) - print(" Please set the HF_TOKEN environment variable with your access token.", file=sys.stderr) - print(" You can create a token at: https://huggingface.co/settings/tokens", file=sys.stderr) - sys.exit(1) - - -def clone_dataset_repo(url, destination): - if not shutil.which("git"): - print("❌ Error: Git executable not found. Please install Git.", file=sys.stderr) - sys.exit(1) - - try: - if destination.exists() or destination.is_symlink(): - print(f"Destination '{destination}' already exists. 
Removing it...") - if destination.is_dir(): - shutil.rmtree(destination) - else: - destination.unlink() - - auth_url = url.replace("https://huggingface.co/", f"https://user:{HF_TOKEN}@huggingface.co/", 1) - print(f"Cloning {url} into {destination}...") - subprocess.run(["git", "clone", auth_url, destination], check=True, capture_output=True) - - print("✅ Git clone is successful.") - - except subprocess.CalledProcessError as e: - print("❌ Git command failed:", file=sys.stderr) - cmd = [url if i == 2 else arg for i, arg in enumerate(e.cmd)] - print(f" Command: {' '.join(map(str, cmd))}", file=sys.stderr) - stderr = e.stderr.decode().strip() - stderr = stderr.replace(HF_TOKEN, "***") if HF_TOKEN else stderr - print(f" Stderr: {stderr}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - data_dir = Path(__file__).absolute().parent - data_dir.mkdir(exist_ok=True) - destination = data_dir / "OJBench_testdata" - clone_dataset_repo(REPO_URL, destination) - - source_file = destination / "prompts" / "full.jsonl" - python_target_file = data_dir / "test_python.jsonl" - cpp_target_file = data_dir / "test_cpp.jsonl" - - print(f"Processing '{source_file}' and splitting into Python and C++ subsets...") - processed_lines = 0 - try: - with ( - source_file.open("r", encoding="utf-8") as infile, - python_target_file.open("w", encoding="utf-8") as outfile_py, - cpp_target_file.open("w", encoding="utf-8") as outfile_cpp, - ): - for line in infile: - data = json.loads(line) - data["question"] = data.pop("prompt") - data["subset_for_metrics"] = data["difficulty"] - if data["language"] == "python": - outfile_py.write(json.dumps(data) + "\n") - elif data["language"] == "cpp": - outfile_cpp.write(json.dumps(data) + "\n") - processed_lines += 1 - print(f"✅ Successfully processed {processed_lines} lines.") - - except (FileNotFoundError, json.JSONDecodeError, OSError) as e: - print(f"❌ Error during file processing: {e}", file=sys.stderr) - sys.exit(1) diff --git 
a/nemo_skills/dataset/olympiadbench/__init__.py b/nemo_skills/dataset/olympiadbench/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/olympiadbench/__init__.py +++ b/nemo_skills/dataset/olympiadbench/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/omni-math/__init__.py b/nemo_skills/dataset/omni-math/__init__.py index fccd9bdcd8..610626a6f1 100644 --- a/nemo_skills/dataset/omni-math/__init__.py +++ b/nemo_skills/dataset/omni-math/__init__.py @@ -15,8 +15,7 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" # some answers are not possible to compare symbolically, so have to use a judge model # setting openai judge by default, but can be overriden from command line for a locally hosted model diff --git a/nemo_skills/dataset/prepare.py b/nemo_skills/dataset/prepare.py index 52f9e031d8..60d41600c9 100755 --- a/nemo_skills/dataset/prepare.py +++ b/nemo_skills/dataset/prepare.py @@ -15,12 +15,20 @@ import importlib import subprocess import sys +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from nemo_skills.dataset.utils import add_header_to_jsonl_inplace, get_lean4_header -def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False, extra_args=""): +def prepare_datasets( + datasets=None, + dataset_groups=None, + add_lean4_header=False, + extra_args="", + parallelism=20, + retries=3, +): if datasets and dataset_groups: raise ValueError("Cannot 
specify both datasets and dataset_groups") @@ -38,21 +46,52 @@ def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False, target_datasets.append(dataset) datasets = target_datasets - for dataset in datasets: - print(f"Preparing {dataset}") - dataset_path = datasets_dir / dataset - subprocess.run(f"{sys.executable} {dataset_path / 'prepare.py'} {extra_args}", shell=True, check=True) - dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset}") + max_workers = max(1, parallelism) if parallelism is not None else 1 - if getattr(dataset_module, "DATASET_GROUP", None) == "math": - if add_lean4_header: - jsonl_files = list(dataset_path.glob("*.jsonl")) - header = get_lean4_header() - for jsonl_file in jsonl_files: - print(f"Adding Lean4 header to {jsonl_file}") - add_header_to_jsonl_inplace(jsonl_file, header) + def run_prepare(dataset_name): + dataset_path = datasets_dir / dataset_name + attempts = max(1, retries + 1) + for attempt in range(1, attempts + 1): + if attempts > 1: + print(f"Preparing {dataset_name} (attempt {attempt}/{attempts})") + else: + print(f"Preparing {dataset_name}") + try: + subprocess.run( + f"{sys.executable} {dataset_path / 'prepare.py'} {extra_args}", + shell=True, + check=True, + ) + break + except subprocess.CalledProcessError: + if attempt == attempts: + raise + print(f"Retrying {dataset_name} after failure") - return datasets + dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset_name}") + if getattr(dataset_module, "DATASET_GROUP", None) == "math" and add_lean4_header: + jsonl_files = list(dataset_path.glob("*.jsonl")) + header = get_lean4_header() + for jsonl_file in jsonl_files: + print(f"Adding Lean4 header to {jsonl_file}") + add_header_to_jsonl_inplace(jsonl_file, header) + return dataset_name + + errors = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(run_prepare, dataset): dataset for dataset in datasets} + for future in 
as_completed(futures): + dataset = futures[future] + try: + future.result() + except Exception as exc: # noqa: BLE001 + errors.append((dataset, exc)) + + if errors: + first_dataset, first_error = errors[0] + raise RuntimeError(f"Failed to prepare dataset {first_dataset}") from first_error + + return list(datasets) if __name__ == "__main__": @@ -68,7 +107,26 @@ def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False, parser.add_argument( "--add_lean4_header", action="store_true", help="Add Lean4 header to JSONL files during preparation" ) + parser.add_argument( + "--parallelism", + type=int, + default=20, + help="Number of datasets to prepare in parallel", + ) + parser.add_argument( + "--retries", + type=int, + default=0, + help="Number of retries per dataset if preparation fails", + ) args, unknown = parser.parse_known_args() extra_args = " ".join(unknown) - prepare_datasets(args.datasets, args.dataset_groups, args.add_lean4_header, extra_args=extra_args) + prepare_datasets( + args.datasets, + args.dataset_groups, + args.add_lean4_header, + extra_args=extra_args, + parallelism=args.parallelism, + retries=args.retries, + ) diff --git a/nemo_skills/dataset/proofnet/__init__.py b/nemo_skills/dataset/proofnet/__init__.py index 83bd350e15..a0bbf2e891 100644 --- a/nemo_skills/dataset/proofnet/__init__.py +++ b/nemo_skills/dataset/proofnet/__init__.py @@ -15,6 +15,5 @@ # Default evaluation and generation settings for the minif2f dataset DATASET_GROUP = "lean4" METRICS_TYPE = "lean4-proof" -EVAL_ARGS = "++eval_type=lean4-proof" -GENERATION_ARGS = "++prompt_config=lean4/formal-proof" +GENERATION_ARGS = "++prompt_config=lean4/formal-proof ++eval_type=lean4-proof" REQUIRES_SANDBOX = True diff --git a/nemo_skills/dataset/putnam-bench/__init__.py b/nemo_skills/dataset/putnam-bench/__init__.py index da44cb2af5..3627077df4 100644 --- a/nemo_skills/dataset/putnam-bench/__init__.py +++ b/nemo_skills/dataset/putnam-bench/__init__.py @@ -16,6 +16,5 @@ # 
Default evaluation and generation settings for the PutnamBench DATASET_GROUP = "lean4" METRICS_TYPE = "lean4-proof" -EVAL_ARGS = "++eval_type=lean4-proof" -GENERATION_ARGS = "++prompt_config=lean4/formal-proof" +GENERATION_ARGS = "++prompt_config=lean4/formal-proof ++eval_type=lean4-proof" REQUIRES_SANDBOX = True diff --git a/nemo_skills/dataset/putnam-bench/prepare.py b/nemo_skills/dataset/putnam-bench/prepare.py index 5e54163ae9..4cb59dc76d 100644 --- a/nemo_skills/dataset/putnam-bench/prepare.py +++ b/nemo_skills/dataset/putnam-bench/prepare.py @@ -16,13 +16,11 @@ import os import re import shutil -import urllib.request +import subprocess +import tempfile from pathlib import Path -import requests - -URL_prefix = "https://raw.githubusercontent.com/trishullab/PutnamBench/dc91ed7/lean4/src/" -URL = "https://github.com/trishullab/PutnamBench/tree/dc91ed7/lean4/src" +REPO_URL = "https://github.com/trishullab/PutnamBench.git" lean_regex = r"(^\s*theorem\s+([\S]+).+?sorry)" @@ -60,31 +58,18 @@ def extract_theorem(filename): return theorem -def get_file_names_from_github(url): - response = requests.get(url) - - if response.status_code == 200: - # Extract file names using a regular expression - # This regex pattern matches the hrefs of the files. - # TODO: This is a pretty fragile approach, as it depends on GitHub's current HTML structure. 
- # find all names with putnam*.lean - pattern = r'putnam[^"]+\.lean' - matches = re.findall(pattern, response.text) - return matches - else: - print(f"Failed to access {url}, Status code: {response.status_code}") - return [] - - def download_dataset(output_path): - if not os.path.exists(output_path): - os.makedirs(output_path) - # get all file names with putnam*.lean - file_names = get_file_names_from_github(URL) - for file_name in file_names: - # download the file if not exists - if not os.path.exists(os.path.join(output_path, file_name)): - urllib.request.urlretrieve(URL_prefix + file_name, os.path.join(output_path, file_name)) + output_dir = Path(output_path) + if output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as tmpdir: + repo_path = Path(tmpdir) / "putnambench" + subprocess.run(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)], check=True) + src_dir = repo_path / "lean4" / "src" + for lean_file in src_dir.rglob("putnam*.lean"): + shutil.copy(lean_file, output_dir / lean_file.name) def save_data(data, output_file): diff --git a/nemo_skills/dataset/ruler/prepare.py b/nemo_skills/dataset/ruler/prepare.py index 9f15cd7f5f..2a1ffe6df1 100644 --- a/nemo_skills/dataset/ruler/prepare.py +++ b/nemo_skills/dataset/ruler/prepare.py @@ -25,13 +25,13 @@ DEFAULT_SETTINGS = """ DATASET_GROUP = "long-context" METRICS_TYPE = "ruler" -EVAL_ARGS = "++eval_type=ruler ++eval_config.match_type={match_type}" GENERATION_ARGS = ( "++prompt_config=generic/default " "++inference.tokens_to_generate={tokens_to_generate} " # ruler is adding prefix for assistant response, so it has to go through completions api "++start_assistant_response_key=generation " "++inference.endpoint_type=text " + "++eval_type=ruler ++eval_config.match_type={match_type} " ) """ TOKENS_TO_GENERATE = {"niah": 128, "vt": 30, "cwe": 120, "fwe": 50, "qa": 32} diff --git a/nemo_skills/dataset/scicode/__init__.py 
b/nemo_skills/dataset/scicode/__init__.py index 34cc48980f..10e61be369 100644 --- a/nemo_skills/dataset/scicode/__init__.py +++ b/nemo_skills/dataset/scicode/__init__.py @@ -15,9 +15,7 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "code" METRICS_TYPE = "scicode" -# generation is a dictionary instead of string and remove_thinking is done during inference -EVAL_ARGS = "++eval_type=scicode ++remove_thinking=False" -GENERATION_ARGS = "++prompt_config=eval/scicode/default" +GENERATION_ARGS = "++prompt_config=eval/scicode/default ++eval_type=scicode" GENERATION_MODULE = "nemo_skills.inference.eval.scicode" REQUIRES_SANDBOX = True EVAL_SPLIT = "test_aai" # default to test + validation for consistency with AAI diff --git a/nemo_skills/dataset/simpleqa/__init__.py b/nemo_skills/dataset/simpleqa/__init__.py index cd1e7cbecd..d3829e4281 100644 --- a/nemo_skills/dataset/simpleqa/__init__.py +++ b/nemo_skills/dataset/simpleqa/__init__.py @@ -15,8 +15,7 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "simpleqa" -EVAL_ARGS = "++eval_type=math " -GENERATION_ARGS = "++prompt_config=generic/default " +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=math" EVAL_SPLIT = "verified" # SimpleQA requires judge model for evaluating factual accuracy diff --git a/nemo_skills/dataset/supergpqa/__init__.py b/nemo_skills/dataset/supergpqa/__init__.py index 53e9e6d85b..6100ac438c 100644 --- a/nemo_skills/dataset/supergpqa/__init__.py +++ b/nemo_skills/dataset/supergpqa/__init__.py @@ -17,6 +17,5 @@ DATASET_GROUP = "multichoice" METRICS_TYPE = "multichoice" -EVAL_ARGS = "++eval_type=multichoice" EVAL_SPLIT = "test" -GENERATION_ARGS = "++prompt_config=eval/aai/mcq-10choices" +GENERATION_ARGS = "++prompt_config=eval/aai/mcq-10choices ++eval_type=multichoice" diff --git a/nemo_skills/dataset/svamp/__init__.py 
b/nemo_skills/dataset/svamp/__init__.py index a1ba94a391..4d7235b152 100644 --- a/nemo_skills/dataset/svamp/__init__.py +++ b/nemo_skills/dataset/svamp/__init__.py @@ -15,5 +15,4 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) DATASET_GROUP = "math" METRICS_TYPE = "math" -EVAL_ARGS = "++eval_type=math" -GENERATION_ARGS = "++prompt_config=generic/math" +GENERATION_ARGS = "++prompt_config=generic/math ++eval_type=math" diff --git a/nemo_skills/dataset/swe-bench/__init__.py b/nemo_skills/dataset/swe-bench/__init__.py index 6fa2d9a7a0..389086ce99 100644 --- a/nemo_skills/dataset/swe-bench/__init__.py +++ b/nemo_skills/dataset/swe-bench/__init__.py @@ -16,6 +16,5 @@ EVAL_SPLIT = "default" DATASET_GROUP = "code" METRICS_TYPE = "swe-bench" -EVAL_ARGS = "++eval_type=no-op" # evaluation is fused with generation for efficiency -GENERATION_ARGS = "" +# evaluation is fused with generation for efficiency GENERATION_MODULE = "nemo_skills.inference.eval.swebench" diff --git a/nemo_skills/dataset/utils.py b/nemo_skills/dataset/utils.py index 7d488a3e8c..0b59a2b7db 100644 --- a/nemo_skills/dataset/utils.py +++ b/nemo_skills/dataset/utils.py @@ -190,9 +190,10 @@ def save_data_from_qwen(dataset, split="test"): ) data_dir = Path(__file__).absolute().parent - original_file = str(data_dir / dataset / f"original_{split}.json") + ns_dataset = dataset if dataset != "math" else "hendrycks_math" + original_file = str(data_dir / ns_dataset / f"original_{split}.json") data_dir.mkdir(exist_ok=True) - output_file = str(data_dir / dataset / f"{split}.jsonl") + output_file = str(data_dir / ns_dataset / f"{split}.jsonl") data = [] if not os.path.exists(original_file): formatted_url = url.format(split=split, dataset=dataset) diff --git a/nemo_skills/dataset/wmt24pp/__init__.py b/nemo_skills/dataset/wmt24pp/__init__.py index 86a7f76717..6c19ed4361 100644 --- a/nemo_skills/dataset/wmt24pp/__init__.py +++ b/nemo_skills/dataset/wmt24pp/__init__.py @@ 
-15,8 +15,6 @@ # settings that define how evaluation should be done by default (all can be changed from cmdline) -PROMPT_CONFIG = "multilingual/segment-translation" DATASET_GROUP = "chat" METRICS_TYPE = "translation" -EVAL_ARGS = "++eval_type=no-op" -GENERATION_ARGS = "" +GENERATION_ARGS = "++prompt_config=multilingual/segment-translation" diff --git a/nemo_skills/evaluation/evaluate_results.py b/nemo_skills/evaluation/evaluate_results.py deleted file mode 100644 index 10b1c89b29..0000000000 --- a/nemo_skills/evaluation/evaluate_results.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging -import sys -from dataclasses import field -from typing import Any - -import hydra - -from nemo_skills.evaluation.evaluator import evaluate -from nemo_skills.utils import ( - get_help_message, - get_logger_name, - nested_dataclass, - remove_thinking, - setup_logging, - unroll_files, -) - -LOG = logging.getLogger(get_logger_name(__file__)) - - -@nested_dataclass(kw_only=True) -class EvaluateResultsConfig: - """Top-level parameters for the script""" - - # list of files to evaluate. Can specify multiple patterns separated by space - # e.g. 
"path/to/file1.jsonl path/to/file2.jsonl" or with regex - # "test_dir/output-rs*.jsonl" - input_files: Any - - eval_type: str - # the supported parameters are different depending on the eval configuration - # check graders.py for the supported eval types and their parameters - eval_config: dict = field(default_factory=dict) - - # whether to remove the thinking part from the final output - remove_thinking: bool = True - thinking_begin: str = "" - thinking_end: str = "" - # generation key in the jsonl file - generation_key: str = "generation" - - data_dir: str | None = None - split: str = "test" - - def __post_init__(self): - if isinstance(self.input_files, str): - self.input_files = self.input_files.split(" ") - - -cs = hydra.core.config_store.ConfigStore.instance() -cs.store(name="base_evaluate_results_config", node=EvaluateResultsConfig) - - -@hydra.main(version_base=None, config_name="base_evaluate_results_config") -def evaluate_results(cfg: EvaluateResultsConfig): - cfg = EvaluateResultsConfig(_init_nested=True, **cfg) - LOG.info("Config used: %s", cfg) - - if cfg.remove_thinking: - LOG.info( - f"Removing the thinking part from the {cfg.generation_key} key " - f"(using {cfg.thinking_begin} and {cfg.thinking_end} tokens). " - 'Original content will be stored in "_full_generation" key.' - ) - for jsonl_file in unroll_files(cfg.input_files): - with open(jsonl_file, encoding="utf-8") as f: - samples = [json.loads(line) for line in f] - for sample in samples: - if cfg.generation_key not in sample: - raise ValueError( - f"Key {cfg.generation_key} not found in a sample, but remove_thinking=True is specified. " - "Use generation_key parameter to specify the key containing the generations." 
- ) - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - remove_thinking(sample, cfg.generation_key, cfg.thinking_begin, cfg.thinking_end) - f.write(json.dumps(sample) + "\n") - - evaluate(cfg) - - -HELP_MESSAGE = get_help_message( - EvaluateResultsConfig, -) - - -if __name__ == "__main__": - if "--help" in sys.argv or "-h" in sys.argv: - print(HELP_MESSAGE) - else: - setup_logging() - evaluate_results() diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index d8d6656dd4..59ef5d8d6e 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -30,27 +30,19 @@ from nemo_skills.evaluation.evaluator.livecodebench import eval_livecodebench from nemo_skills.evaluation.evaluator.math import ( Lean4ProofEvaluator, - Lean4StatementEvaluator, MathEvaluator, ) from nemo_skills.evaluation.evaluator.mcq import eval_mcq from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr -from nemo_skills.evaluation.evaluator.ojbench import eval_ojbench from nemo_skills.evaluation.evaluator.ruler import eval_ruler from nemo_skills.evaluation.evaluator.scicode import eval_scicode - -def dummy_eval(cfg): - return - - EVALUATOR_MAP = { # Function-based evaluators (batch-only) "evalplus": eval_evalplus, "if": eval_if, "ifbench": eval_ifbench, "bfcl": eval_bfcl, - "no-op": dummy_eval, "multichoice": eval_mcq, "ruler": eval_ruler, "livecodebench": eval_livecodebench, @@ -59,7 +51,6 @@ def dummy_eval(cfg): "scicode": eval_scicode, "mrcr": eval_mrcr, "bigcodebench": eval_bigcodebench, - "ojbench": eval_ojbench, "human_eval_infilling": eval_human_eval_infilling, } @@ -67,7 +58,6 @@ def dummy_eval(cfg): EVALUATOR_CLASS_MAP = { "math": MathEvaluator, "lean4-proof": Lean4ProofEvaluator, - "lean4-statement": Lean4StatementEvaluator, # Other evaluators can be added here as they're converted to classes "ioi": IOIEvaluator, } @@ -117,18 +107,16 @@ def 
supports_single_eval(eval_type: str, config: Dict[str, Any]) -> bool: return evaluator.supports_single_eval() -def evaluate(cfg): +def evaluate(eval_type, eval_config): """Main evaluation function that handles both class-based and function-based evaluators.""" - eval_type = cfg.eval_type - # Check if it's a class-based evaluator first if eval_type in EVALUATOR_CLASS_MAP: - evaluator = get_evaluator_class(eval_type, cfg.eval_config) - return asyncio.run(evaluator.eval_full(cfg.input_files)) + evaluator = get_evaluator_class(eval_type, eval_config) + return asyncio.run(evaluator.eval_full()) # Fall back to function-based evaluator if eval_type in EVALUATOR_MAP: - return EVALUATOR_MAP[eval_type](cfg) + return EVALUATOR_MAP[eval_type](eval_config) # Not found in either map all_types = list(EVALUATOR_CLASS_MAP.keys()) + list(EVALUATOR_MAP.keys()) diff --git a/nemo_skills/evaluation/evaluator/arena.py b/nemo_skills/evaluation/evaluator/arena.py index a5db88ed54..dd1d09135d 100644 --- a/nemo_skills/evaluation/evaluator/arena.py +++ b/nemo_skills/evaluation/evaluator/arena.py @@ -23,7 +23,7 @@ import pandas as pd from sklearn.linear_model import LogisticRegression -from nemo_skills.utils import get_logger_name, nested_dataclass +from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) @@ -195,14 +195,3 @@ def get_aggregate_score(scores, weight=3): "invalid_scores": num_invalid, } return metrics - - -@nested_dataclass(kw_only=True) -class LlmEvaluatorConfig: - batch_size: int = 100 # lower if running into rate limits - tokens_to_generate: int = 4096 # will auto-lower to max possible for NGC models - use_batch_api: bool = True # only supported for OpenAI models! 
- base_url: str = "https://api.openai.com/v1" - judge_model: str = JUDGE_MODEL - # defaults to True to avoid regenerating judgements unless necessary - skip_filled: bool = True diff --git a/nemo_skills/evaluation/evaluator/base.py b/nemo_skills/evaluation/evaluator/base.py index dc5cbb50c9..947fe117a5 100644 --- a/nemo_skills/evaluation/evaluator/base.py +++ b/nemo_skills/evaluation/evaluator/base.py @@ -16,11 +16,19 @@ import json import os from abc import ABC -from typing import Any, Dict, List +from typing import Any, Dict import tqdm -from nemo_skills.utils import unroll_files +from nemo_skills.utils import nested_dataclass + + +@nested_dataclass(kw_only=True) +class BaseEvaluatorConfig: + # TODO: should we pass input_file separately everywhere? + input_file: str | None = None # could be None for interleaved evals + data_dir: str | None = None + split: str = "test" class BaseEvaluator(ABC): @@ -31,42 +39,37 @@ def __init__(self, config: Dict[str, Any], num_parallel_requests=10): self.config = config self.num_parallel_requests = num_parallel_requests - async def eval_full(self, input_files: List[str]) -> None: - """ - Evaluate full dataset in batch mode. 
- - Args: - input_files: List of input files to evaluate - """ + async def eval_full(self) -> None: + """Evaluate full dataset in batch mode.""" semaphore = asyncio.Semaphore(self.num_parallel_requests) - for input_file in tqdm.tqdm(unroll_files(input_files), desc="Processing files"): - # assume that input_file is small enough to entirely fit in the memory - async def process_line(line_data): - # Concurrency control and merge updates into original record - async with semaphore: - updates = await self.eval_single(line_data) - merged = dict(line_data) - merged.update(updates) - return merged - - with open(input_file, "rt", encoding="utf-8") as fin: - tasks = [] - for file_line in fin: - line_dict = json.loads(file_line) - task = asyncio.create_task(process_line(line_dict)) - tasks.append(task) - - # Await tasks and write to temp file then replace original - temp_file = input_file + "-tmp" - with open(temp_file, "wt", encoding="utf-8") as f: - for task in tqdm.tqdm( - tasks, total=len(tasks), desc=f"Completed Evaluation for {os.path.basename(input_file)}" - ): - line = await task - f.write(json.dumps(line) + "\n") - - # Replace original with temp file - os.replace(temp_file, input_file) + + # assume that input_file is small enough to entirely fit in the memory + async def process_line(line_data): + # Concurrency control and merge updates into original record + async with semaphore: + updates = await self.eval_single(line_data) + merged = dict(line_data) + merged.update(updates) + return merged + + with open(self.config.input_file, "rt", encoding="utf-8") as fin: + tasks = [] + for file_line in fin: + line_dict = json.loads(file_line) + task = asyncio.create_task(process_line(line_dict)) + tasks.append(task) + + # Await tasks and write to temp file then replace original + temp_file = self.config.input_file + "-tmp" + with open(temp_file, "wt", encoding="utf-8") as f: + for task in tqdm.tqdm( + tasks, total=len(tasks), desc=f"Completed Evaluation for 
{os.path.basename(self.config.input_file)}" + ): + line = await task + f.write(json.dumps(line) + "\n") + + # Replace original with temp file + os.replace(temp_file, self.config.input_file) async def eval_single(self, data_point: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/nemo_skills/evaluation/evaluator/bfcl.py b/nemo_skills/evaluation/evaluator/bfcl.py index 79fce64e83..d20f8aa01d 100644 --- a/nemo_skills/evaluation/evaluator/bfcl.py +++ b/nemo_skills/evaluation/evaluator/bfcl.py @@ -19,13 +19,14 @@ import subprocess from pathlib import Path -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @nested_dataclass(kw_only=True) -class BFCLEvaluatorConfig: +class BFCLEvaluatorConfig(BaseEvaluatorConfig): model: str = "o3-mini-2025-01-31-FC" # Uses the same eval as Llama-Nemotron timeout: int = 300 @@ -36,38 +37,37 @@ def eval_bfcl(cfg): This function wraps the external BFCL evaluation tool, converting between Nemo-Skills format and BFCL format, then merging results back. 
""" - eval_config = BFCLEvaluatorConfig(**cfg.eval_config) + eval_config = BFCLEvaluatorConfig(**cfg) model_name = eval_config.model.replace("/", "_") - # model_name = eval_config.model.split("/")[-1] - for jsonl_file in unroll_files(cfg.input_files): - # Output files are structures as bfcl_v3/TEST_CATEGORY/jsonl_file - test_category = str(Path(jsonl_file).absolute().parent.name).removeprefix("bfcl_v3.") - - # Convert Nemo-Skills output file to BFCL format - output_dir = Path("/opt/gorilla/berkeley-function-call-leaderboard") / f"result/{model_name}" - score_file = ( - Path("/opt/gorilla/berkeley-function-call-leaderboard") - / f"score/{model_name}" - / f"BFCL_v3_{test_category}_score.json" - ) - - bfcl_input_file = _convert_to_bfcl_format(jsonl_file, output_dir=output_dir, test_category=test_category) - - try: - # Run BFCL evaluation using the CLI - # We need the OpenAI model class decoding functions for evaluation but not really the actual API key for evaluation - # So we set the API key to a dummy value - cmd = f"OPENAI_API_KEY=dummy bfcl evaluate --model {eval_config.model} --test-category {test_category}" - - LOG.info(f"Running BFCL evaluation: {cmd}") - subprocess.run(cmd, shell=True, check=True, timeout=eval_config.timeout) - - # Merge the bfcl_input_file with the score_file, and write to the original file - _merge_bfcl_results(jsonl_file, bfcl_input_file, score_file) - - except subprocess.TimeoutExpired: - LOG.error(f"BFCL evaluation timed out after {eval_config.timeout} seconds") - raise + jsonl_file = eval_config.input_file + # Output files are structures as bfcl_v3/TEST_CATEGORY/jsonl_file + test_category = str(Path(jsonl_file).absolute().parent.name).removeprefix("bfcl_v3.") + + # Convert Nemo-Skills output file to BFCL format + output_dir = Path("/opt/gorilla/berkeley-function-call-leaderboard") / f"result/{model_name}" + score_file = ( + Path("/opt/gorilla/berkeley-function-call-leaderboard") + / f"score/{model_name}" + / 
f"BFCL_v3_{test_category}_score.json" + ) + + bfcl_input_file = _convert_to_bfcl_format(jsonl_file, output_dir=output_dir, test_category=test_category) + + try: + # Run BFCL evaluation using the CLI + # We need the OpenAI model class decoding functions for evaluation but not really the actual API key for evaluation + # So we set the API key to a dummy value + cmd = f"OPENAI_API_KEY=dummy bfcl evaluate --model {eval_config.model} --test-category {test_category}" + + LOG.info(f"Running BFCL evaluation: {cmd}") + subprocess.run(cmd, shell=True, check=True, timeout=eval_config.timeout) + + # Merge the bfcl_input_file with the score_file, and write to the original file + _merge_bfcl_results(jsonl_file, bfcl_input_file, score_file) + + except subprocess.TimeoutExpired: + LOG.error(f"BFCL evaluation timed out after {eval_config.timeout} seconds") + raise def _convert_to_bfcl_format(jsonl_file, output_dir, test_category): diff --git a/nemo_skills/evaluation/evaluator/code.py b/nemo_skills/evaluation/evaluator/code.py index e221019889..c3f626ce52 100644 --- a/nemo_skills/evaluation/evaluator/code.py +++ b/nemo_skills/evaluation/evaluator/code.py @@ -20,11 +20,12 @@ import subprocess import sys from argparse import Namespace +from dataclasses import field from omegaconf import OmegaConf -from nemo_skills.file_utils import unroll_files -from nemo_skills.utils import get_logger_name +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @@ -100,45 +101,51 @@ def install_from_git(git_url): print(f"Error during installation: {e}") +@nested_dataclass(kw_only=True) +class EvalPlusEvaluatorConfig(BaseEvaluatorConfig): + # evalplus specific configurations + evalplus: dict = field(default_factory=dict) + + def eval_evalplus(cfg): + cfg = EvalPlusEvaluatorConfig(**cfg) # TODO: need to move it to a separate docker (either our sandbox or separate srun) from 
evalplus.evaluate import evaluate - # processing each generation separately (TODO: evalplus can do it together, but need to figure out the format) - for jsonl_file in unroll_files(cfg.input_files): - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line)) for line in f] - # all changes will be done with a new key "completion", so it's ok to write to the same file - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - eval_config = { - "samples": jsonl_file, - "base_only": False, - "parallel": None, - "i_just_wanna_run": False, - "test_details": False, - "min_time_limit": 1, - "gt_time_limit_factor": 4.0, - "mini": False, - "noextreme": False, - "version": "default", - } - eval_config.update(OmegaConf.to_container(cfg.eval_config)) - evaluate(Namespace(**eval_config)) - with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: - evalplus_grades = json.load(fin) - # adding is_correct key to allow compute_metrics to work - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - sample["is_correct"] = evalplus_grades["eval"][sample["task_id"]][0]["base_status"] == "pass" - sample["is_correct-plus"] = ( - sample["is_correct"] and evalplus_grades["eval"][sample["task_id"]][0]["plus_status"] == "pass" - ) - f.write(json.dumps(sample) + "\n") - - # moving eval file as otherwise evalplus does not want to recompute metrics if it's present.. 
- shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") + jsonl_file = cfg.input_file + with open(jsonl_file) as f: + samples = [preprocess_code(json.loads(line)) for line in f] + # all changes will be done with a new key "completion", so it's ok to write to the same file + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + eval_config = { + "samples": jsonl_file, + "base_only": False, + "parallel": None, + "i_just_wanna_run": False, + "test_details": False, + "min_time_limit": 1, + "gt_time_limit_factor": 4.0, + "mini": False, + "noextreme": False, + "version": "default", + } + eval_config.update(OmegaConf.to_container(cfg.evalplus)) + evaluate(Namespace(**eval_config)) + with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: + evalplus_grades = json.load(fin) + # adding is_correct key to allow compute_metrics to work + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + sample["is_correct"] = evalplus_grades["eval"][sample["task_id"]][0]["base_status"] == "pass" + sample["is_correct-plus"] = ( + sample["is_correct"] and evalplus_grades["eval"][sample["task_id"]][0]["plus_status"] == "pass" + ) + f.write(json.dumps(sample) + "\n") + + # moving eval file as otherwise evalplus does not want to recompute metrics if it's present.. 
+ shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") def install_requirements(url): @@ -150,20 +157,22 @@ def install_requirements(url): def eval_livecodebench_pro(cfg): - for jsonl_file in unroll_files(cfg.input_files): - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line), "python") for line in f] - for sample in samples: - sample["problem_id"] = sample.pop("task_id") - sample["text_response"] = sample.pop("completion") - sample["response_meta"] = None + cfg = BaseEvaluatorConfig(**cfg) + jsonl_file = cfg.input_file + with open(jsonl_file) as f: + samples = [preprocess_code(json.loads(line), "python") for line in f] + for sample in samples: + sample["problem_id"] = sample.pop("task_id") + sample["text_response"] = sample.pop("completion") + sample["response_meta"] = None - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") def eval_livebench_coding(cfg): + cfg = BaseEvaluatorConfig(**cfg) try: from livecodebench.evaluate import evaluate except ImportError: @@ -175,43 +184,43 @@ def eval_livebench_coding(cfg): LOG.info("Failed to install 'livecodebench'. 
Please install it manually.") raise - for jsonl_file in unroll_files(cfg.input_files): - samples = [] - with open(jsonl_file) as f: - for line in f: - sample = json.loads(line) - if sample["task"] == "coding_completion": - assert len(sample["partial_solution"]) > 0 - sample = preprocess_code(sample, strip_whitespace=False) - sample["completion"] = sample["completion"].replace("\t", " ") - full_solution = sample["partial_solution"] + "\n" + sample["completion"] - sample["code_list"] = [full_solution] - else: - sample = preprocess_code(sample, strip_whitespace=True) - sample["code_list"] = [sample["completion"]] - - samples.append(sample) - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - evaluate( - custom_output_file=jsonl_file, - k_list=[1], - num_process_evaluate=12, - timeout=6, - ) - - with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: - eval_grades = json.load(fin) - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - sample["graded_list"] = eval_grades["eval"][sample["question_id"]]["graded_list"] - f.write(json.dumps(sample) + "\n") - - # moving eval file to ensure metrics are recomputed - shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") + jsonl_file = cfg.input_file + samples = [] + with open(jsonl_file) as f: + for line in f: + sample = json.loads(line) + if sample["task"] == "coding_completion": + assert len(sample["partial_solution"]) > 0 + sample = preprocess_code(sample, strip_whitespace=False) + sample["completion"] = sample["completion"].replace("\t", " ") + full_solution = sample["partial_solution"] + "\n" + sample["completion"] + sample["code_list"] = [full_solution] + else: + sample = preprocess_code(sample, strip_whitespace=True) + sample["code_list"] = [sample["completion"]] + + samples.append(sample) + + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample 
in samples: + f.write(json.dumps(sample) + "\n") + + evaluate( + custom_output_file=jsonl_file, + k_list=[1], + num_process_evaluate=12, + timeout=6, + ) + + with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: + eval_grades = json.load(fin) + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + sample["graded_list"] = eval_grades["eval"][sample["question_id"]]["graded_list"] + f.write(json.dumps(sample) + "\n") + + # moving eval file to ensure metrics are recomputed + shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") def install_or_upgrade_package(package_name): @@ -224,6 +233,7 @@ def install_or_upgrade_package(package_name): def eval_bigcodebench(cfg): + cfg = BaseEvaluatorConfig(**cfg) try: from bigcodebench.evaluate import evaluate except ImportError: @@ -238,50 +248,49 @@ def eval_bigcodebench(cfg): raise data_split = None - for jsonl_file in unroll_files(cfg.input_files): - samples = [] - with open(jsonl_file) as f: - for line in f: - generation_dict = preprocess_code(json.loads(line)) - generation_dict["solution"] = generation_dict.pop("completion") - samples.append(generation_dict) - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - if data_split is None: - data_split = sample["split"] - elif data_split != sample["split"]: - raise ValueError( - f"All samples should have the same split, but got {data_split} and {sample['split']}" - ) - - # https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py#L117 - # if the input filename is "output.jsonl" - # then there will be two output files (generated) after evaluation: - # "output_eval_results-saved.json" - # "output_pass_at_k.json" - evaluate( - "instruct", - data_split, # full, hard - samples=jsonl_file, - execution="local", - pass_k="1", - calibrated=True, - save_pass_rate=True, # saves pass_at_k results in file: 
"output_pass_at_k.json" - ) - - with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: - eval_grades = json.load(fin) - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - sample["status"] = eval_grades["eval"][sample["task_id"]][0]["status"] - f.write(json.dumps(sample) + "\n") - - # moving eval file to ensure metrics are recomputed - shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") + jsonl_file = cfg.input_file + samples = [] + with open(jsonl_file) as f: + for line in f: + generation_dict = preprocess_code(json.loads(line)) + generation_dict["solution"] = generation_dict.pop("completion") + samples.append(generation_dict) + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + if data_split is None: + data_split = sample["split"] + elif data_split != sample["split"]: + raise ValueError(f"All samples should have the same split, but got {data_split} and {sample['split']}") + + # https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py#L117 + # if the input filename is "output.jsonl" + # then there will be two output files (generated) after evaluation: + # "output_eval_results-saved.json" + # "output_pass_at_k.json" + evaluate( + "instruct", + data_split, # full, hard + samples=jsonl_file, + execution="local", + pass_k="1", + calibrated=True, + save_pass_rate=True, # saves pass_at_k results in file: "output_pass_at_k.json" + ) + + with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: + eval_grades = json.load(fin) + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + sample["status"] = eval_grades["eval"][sample["task_id"]][0]["status"] + f.write(json.dumps(sample) + "\n") + + # moving eval file to ensure metrics are recomputed + shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + 
"_eval_results-saved.json") def eval_human_eval_infilling(cfg): + cfg = BaseEvaluatorConfig(**cfg) try: from human_eval_infilling.evaluate import evaluate except ImportError: @@ -318,34 +327,32 @@ def postprocess_code(sample): return sample data_split = None - for jsonl_file in unroll_files(cfg.input_files): - samples = [] - with open(jsonl_file) as f: - for line in f: - sample = json.loads(line) - if data_split is None: - data_split = sample["split"] - elif data_split != sample["split"]: - raise ValueError( - f"All samples should have the same split, but got {data_split} and {sample['split']}" - ) - - sample = preprocess_code(sample, strip_whitespace=False) - sample["original_completion"] = sample["completion"] - sample = postprocess_code(sample) - samples.append(sample) - - # all changes will be done with a new key "completion", so it's ok to write to the same file - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - evaluate(data_split, jsonl_file, k=[1], n_workers=4, timeout=3.0) - - with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: - eval_grades = json.load(fin) - - with open(jsonl_file, "wt", encoding="utf-8") as f_out: - for s in samples: - s["passed"] = eval_grades["eval"][s["task_id"]][0]["passed"] - f_out.write(json.dumps(s) + "\n") + jsonl_file = cfg.input_file + samples = [] + with open(jsonl_file) as f: + for line in f: + sample = json.loads(line) + if data_split is None: + data_split = sample["split"] + elif data_split != sample["split"]: + raise ValueError(f"All samples should have the same split, but got {data_split} and {sample['split']}") + + sample = preprocess_code(sample, strip_whitespace=False) + sample["original_completion"] = sample["completion"] + sample = postprocess_code(sample) + samples.append(sample) + + # all changes will be done with a new key "completion", so it's ok to write to the same file + with open(jsonl_file, "wt", 
encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + evaluate(data_split, jsonl_file, k=[1], n_workers=4, timeout=3.0) + + with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: + eval_grades = json.load(fin) + + with open(jsonl_file, "wt", encoding="utf-8") as f_out: + for s in samples: + s["passed"] = eval_grades["eval"][s["task_id"]][0]["passed"] + f_out.write(json.dumps(s) + "\n") diff --git a/nemo_skills/evaluation/evaluator/ifbench.py b/nemo_skills/evaluation/evaluator/ifbench.py index 2b7e06ca0f..537bb92e07 100644 --- a/nemo_skills/evaluation/evaluator/ifbench.py +++ b/nemo_skills/evaluation/evaluator/ifbench.py @@ -18,40 +18,42 @@ import subprocess from pathlib import Path -from nemo_skills.utils import get_logger_name, unroll_files +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) def eval_ifbench(cfg): - for jsonl_file in unroll_files(cfg.input_files): - jsonl_path = Path(jsonl_file).resolve() - output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" - output_dir.mkdir(parents=True, exist_ok=True) - cmd = ( - "cd /opt/benchmarks/IFBench && python -m run_eval " - f"--input_data={jsonl_file} " - f"--input_response_data={jsonl_file} " - f"--output_dir={output_dir} " - ) - subprocess.run(cmd, shell=True, check=True) - # fusing eval metrics back into the generation file - with open(jsonl_file, "rt", encoding="utf-8") as f: - samples = [json.loads(line) for line in f] - - with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample["loose_eval"] = eval_result - - with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, 
eval_results): - sample["strict_eval"] = eval_result - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # removing temporary metric directory to avoid reusing it - shutil.rmtree(output_dir) + cfg = BaseEvaluatorConfig(**cfg) + jsonl_file = cfg.input_file + jsonl_path = Path(jsonl_file).resolve() + output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" + output_dir.mkdir(parents=True, exist_ok=True) + cmd = ( + "cd /opt/benchmarks/IFBench && python -m run_eval " + f"--input_data={jsonl_file} " + f"--input_response_data={jsonl_file} " + f"--output_dir={output_dir} " + ) + subprocess.run(cmd, shell=True, check=True) + # fusing eval metrics back into the generation file + with open(jsonl_file, "rt", encoding="utf-8") as f: + samples = [json.loads(line) for line in f] + + with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample["loose_eval"] = eval_result + + with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample["strict_eval"] = eval_result + + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + # removing temporary metric directory to avoid reusing it + shutil.rmtree(output_dir) diff --git a/nemo_skills/evaluation/evaluator/ifeval.py b/nemo_skills/evaluation/evaluator/ifeval.py index ecf4e1f1d0..3708737994 100644 --- a/nemo_skills/evaluation/evaluator/ifeval.py +++ b/nemo_skills/evaluation/evaluator/ifeval.py @@ -18,40 +18,42 @@ import subprocess from pathlib import Path -from nemo_skills.utils import get_logger_name, unroll_files +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import 
get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) def eval_if(cfg): - for jsonl_file in unroll_files(cfg.input_files): - jsonl_path = Path(jsonl_file).resolve() - output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" - output_dir.mkdir(parents=True, exist_ok=True) - cmd = ( - "cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main " - f"--input_data={jsonl_file} " - f"--input_response_data={jsonl_file} " - f"--output_dir={output_dir} " - ) - subprocess.run(cmd, shell=True, check=True) - # fusing eval metrics back into the generation file - with open(jsonl_file, "rt", encoding="utf-8") as f: - samples = [json.loads(line) for line in f] - - with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample["loose_eval"] = eval_result - - with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample["strict_eval"] = eval_result - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # removing temporary metric directory to avoid reusing it - shutil.rmtree(output_dir) + cfg = BaseEvaluatorConfig(**cfg) + jsonl_file = cfg.input_file + jsonl_path = Path(jsonl_file).resolve() + output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" + output_dir.mkdir(parents=True, exist_ok=True) + cmd = ( + "cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main " + f"--input_data={jsonl_file} " + f"--input_response_data={jsonl_file} " + f"--output_dir={output_dir} " + ) + subprocess.run(cmd, shell=True, check=True) + # fusing eval metrics back into the generation file + with open(jsonl_file, "rt", encoding="utf-8") as f: + samples = [json.loads(line) 
for line in f] + + with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample["loose_eval"] = eval_result + + with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample["strict_eval"] = eval_result + + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + # removing temporary metric directory to avoid reusing it + shutil.rmtree(output_dir) diff --git a/nemo_skills/evaluation/evaluator/ioi.py b/nemo_skills/evaluation/evaluator/ioi.py index c883b96e0b..239a23db6c 100644 --- a/nemo_skills/evaluation/evaluator/ioi.py +++ b/nemo_skills/evaluation/evaluator/ioi.py @@ -21,13 +21,13 @@ from typing import Dict from nemo_skills.code_execution.sandbox import LocalSandbox -from nemo_skills.evaluation.evaluator.base import BaseEvaluator +from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig from nemo_skills.file_utils import jdump -from nemo_skills.utils import nested_dataclass, unroll_files +from nemo_skills.utils import nested_dataclass @nested_dataclass(kw_only=True) -class IOIEvaluatorConfig: +class IOIEvaluatorConfig(BaseEvaluatorConfig): test_file: str = "test_metadata.json" num_workers: int = 16 # number of test workers test_batch_size: int = 16 # number of tests to run concurrently @@ -374,19 +374,19 @@ async def _evaluate_entry(self, entry: dict) -> dict: "test_case_results": test_case_results, } - async def eval_full(self, input_files): # type: ignore[override] - for jsonl_file in unroll_files(input_files): - with open(jsonl_file, "r", encoding="utf-8") as f: - all_samples = [json.loads(line) for line in f] + async def eval_full(self): # type: ignore[override] + jsonl_file = self.eval_cfg.input_file + 
with open(jsonl_file, "r", encoding="utf-8") as f: + all_samples = [json.loads(line) for line in f] - tasks = [self._evaluate_entry(s) for s in all_samples] - outputs = await asyncio.gather(*tasks) + tasks = [self._evaluate_entry(s) for s in all_samples] + outputs = await asyncio.gather(*tasks) - for s, o in zip(all_samples, outputs): - s["test_case_results"] = o["test_case_results"] - s["eval_status"] = o["eval_status"] + for s, o in zip(all_samples, outputs): + s["test_case_results"] = o["test_case_results"] + s["eval_status"] = o["eval_status"] - jdump(all_samples, jsonl_file, mode="wt") + jdump(all_samples, jsonl_file, mode="wt") if self.pool is not None: self.pool.close() diff --git a/nemo_skills/evaluation/evaluator/livecodebench.py b/nemo_skills/evaluation/evaluator/livecodebench.py index 0c852b9b7f..5f6e88f9f1 100644 --- a/nemo_skills/evaluation/evaluator/livecodebench.py +++ b/nemo_skills/evaluation/evaluator/livecodebench.py @@ -27,8 +27,9 @@ import httpx from nemo_skills.code_execution.sandbox import Sandbox, get_sandbox +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig from nemo_skills.evaluation.evaluator.code import preprocess_code -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @@ -37,7 +38,7 @@ @nested_dataclass(kw_only=True) -class LiveCodeBenchEvaluatorConfig: +class LiveCodeBenchEvaluatorConfig(BaseEvaluatorConfig): sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"}) language: str = "python" # use either "python" or "cpp" test_file: str = None @@ -202,86 +203,83 @@ def _install_packages_locally(interpreter: str): raise -async def eval_livecodebench_async(cfg, eval_config: LiveCodeBenchEvaluatorConfig): +async def eval_livecodebench_async(eval_config: LiveCodeBenchEvaluatorConfig): """Evaluation running within a sandbox.""" async with 
sandbox_context(eval_config.sandbox) as sandbox: if not await _install_packages_in_sandbox(sandbox, eval_config): return - for jsonl_file in unroll_files(cfg.input_files): - LOG.info(f"Processing file: {jsonl_file} in sandbox") - try: - samples, release_version = _preprocess_and_validate_file(jsonl_file, eval_config.language) - except ValueError as e: - LOG.error(f"Skipping {jsonl_file} due to pre-processing error: {e}") - continue - - if eval_config.language == "python": - release_version = f"release_{release_version}" - test_file_arg = repr(eval_config.test_file) if eval_config.test_file else "None" - eval_code = textwrap.dedent(f""" - from livecodebench.evaluate import evaluate - evaluate( - custom_output_file='{jsonl_file}', - release_version='{release_version}', - test_file={test_file_arg}, - k_list=[1], - language='{eval_config.language}', - num_process_evaluate={eval_config.num_processes}, - timeout={eval_config.timeout} - ) - """) - - cmd = f"{eval_config.interpreter} -c {shlex.quote(eval_code)}" - output, _ = await execute_in_sandbox_with_retries( - sandbox, - eval_config.num_retries, - cmd, - language="shell", - timeout=eval_config.timeout * len(samples) + eval_config.timeout_buffer, - max_output_characters=100_000, + jsonl_file = eval_config.input_file + LOG.info(f"Processing file: {jsonl_file} in sandbox") + try: + samples, release_version = _preprocess_and_validate_file(jsonl_file, eval_config.language) + except ValueError as e: + LOG.error(f"Skipping {jsonl_file} due to pre-processing error: {e}") + + if eval_config.language == "python": + release_version = f"release_{release_version}" + test_file_arg = repr(eval_config.test_file) if eval_config.test_file else "None" + eval_code = textwrap.dedent(f""" + from livecodebench.evaluate import evaluate + evaluate( + custom_output_file='{jsonl_file}', + release_version='{release_version}', + test_file={test_file_arg}, + k_list=[1], + language='{eval_config.language}', + 
num_process_evaluate={eval_config.num_processes}, + timeout={eval_config.timeout} ) + """) + + cmd = f"{eval_config.interpreter} -c {shlex.quote(eval_code)}" + output, _ = await execute_in_sandbox_with_retries( + sandbox, + eval_config.num_retries, + cmd, + language="shell", + timeout=eval_config.timeout * len(samples) + eval_config.timeout_buffer, + max_output_characters=100_000, + ) - if output.get("process_status") != "completed": - LOG.error(f"Evaluation failed for {jsonl_file}. Stderr: {output.get('stderr')}") - continue + if output.get("process_status") != "completed": + LOG.error(f"Evaluation failed for {jsonl_file}. Stderr: {output.get('stderr')}") - _postprocess_results(jsonl_file, samples) + _postprocess_results(jsonl_file, samples) -def eval_livecodebench_without_sandbox(cfg, eval_config: LiveCodeBenchEvaluatorConfig): +def eval_livecodebench_without_sandbox(eval_config: LiveCodeBenchEvaluatorConfig): """Evaluation running on the local machine.""" evaluate_fn = _install_packages_locally(eval_config.interpreter) if not evaluate_fn: return - for jsonl_file in unroll_files(cfg.input_files): - LOG.info(f"Processing file: {jsonl_file} locally") - try: - samples, release_version = _preprocess_and_validate_file(jsonl_file, eval_config.language) - except ValueError as e: - LOG.error(f"Skipping {jsonl_file} due to pre-processing error: {e}") - continue - - if eval_config.language == "python": - release_version = f"release_{release_version}" - - evaluate_fn( - custom_output_file=jsonl_file, - release_version=release_version, - k_list=[1], - language=eval_config.language, - test_file=eval_config.test_file, - num_process_evaluate=eval_config.num_processes, - timeout=eval_config.timeout, - ) + jsonl_file = eval_config.input_file + LOG.info(f"Processing file: {jsonl_file} locally") + try: + samples, release_version = _preprocess_and_validate_file(jsonl_file, eval_config.language) + except ValueError as e: + LOG.error(f"Skipping {jsonl_file} due to pre-processing 
error: {e}") + + if eval_config.language == "python": + release_version = f"release_{release_version}" + + evaluate_fn( + custom_output_file=jsonl_file, + release_version=release_version, + k_list=[1], + language=eval_config.language, + test_file=eval_config.test_file, + num_process_evaluate=eval_config.num_processes, + timeout=eval_config.timeout, + ) - _postprocess_results(jsonl_file, samples) + _postprocess_results(jsonl_file, samples) def eval_livecodebench(cfg): """Main entry point for LiveCodeBench evaluation.""" - eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config) + eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg) if eval_config.language == "python" and eval_config.interpreter not in ["python", "pypy3"]: raise ValueError("Python interpreter must be 'python' or 'pypy3'.") @@ -297,6 +295,6 @@ def eval_livecodebench(cfg): raise RuntimeError("The 'pypy3' interpreter requires a running sandbox, but the service was unreachable.") if sandbox_is_ready: - asyncio.run(eval_livecodebench_async(cfg, eval_config)) + asyncio.run(eval_livecodebench_async(eval_config)) else: - eval_livecodebench_without_sandbox(cfg, eval_config) + eval_livecodebench_without_sandbox(eval_config) diff --git a/nemo_skills/evaluation/evaluator/math.py b/nemo_skills/evaluation/evaluator/math.py index c22e790f33..8437968b1a 100644 --- a/nemo_skills/evaluation/evaluator/math.py +++ b/nemo_skills/evaluation/evaluator/math.py @@ -13,7 +13,7 @@ # limitations under the License. 
import logging -from dataclasses import asdict, field +from dataclasses import field from nemo_skills.code_execution.proof_utils import ( ProofBuildConfig, @@ -21,15 +21,15 @@ determine_proof_status, ) from nemo_skills.code_execution.sandbox import get_sandbox -from nemo_skills.evaluation.evaluator.base import BaseEvaluator -from nemo_skills.evaluation.math_grader import evaluate_result +from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig +from nemo_skills.evaluation.math_grader import extract_answer, math_equal from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @nested_dataclass(kw_only=True) -class MathEvaluatorConfig: +class MathEvaluatorConfig(BaseEvaluatorConfig): numeric_precision: int = 15 timeout: int = 10 # if True will not attempt to re-extract based on \boxed or regex @@ -42,7 +42,7 @@ class MathEvaluatorConfig: @nested_dataclass(kw_only=True) -class LeanEvaluatorConfig: +class LeanEvaluatorConfig(BaseEvaluatorConfig): sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"}) num_parallel_requests: int = 10 timeout: float = 30.0 @@ -52,18 +52,36 @@ class LeanEvaluatorConfig: extract_code_mode: str = "last" -# Evaluator Classes - - class MathEvaluator(BaseEvaluator): def __init__(self, config: dict, num_parallel_requests=10): super().__init__(config, num_parallel_requests) self.eval_config = MathEvaluatorConfig(**self.config) - self.eval_config_dict = asdict(self.eval_config) async def eval_single(self, data_point: dict[str, any]) -> dict[str, any]: """Evaluate single problem for math""" - return evaluate_result(data_point, **self.eval_config_dict) + if not self.eval_config.use_predicted_answer_key: + data_point["predicted_answer"] = extract_answer( + data_point["generation"], + extract_from_boxed=self.eval_config.extract_from_boxed, + extract_regex=self.eval_config.extract_regex, + ) + else: + if "predicted_answer" not in data_point: + raise 
ValueError( + "predicted_answer key not found in the data_point. Set use_predicted_answer_key=False to re-extract" + ) + + gt_answer = data_point["expected_answer"] + predicted_answer = data_point["predicted_answer"] + + data_point["symbolic_correct"] = math_equal( + gt_answer, + predicted_answer, + take_modulo=self.eval_config.take_modulo, + numeric_precision=self.eval_config.numeric_precision, + timeout_seconds=self.eval_config.timeout, + ) + return data_point class Lean4ProofEvaluator(BaseEvaluator): @@ -72,9 +90,8 @@ class Lean4ProofEvaluator(BaseEvaluator): def __init__(self, config: dict, num_parallel_requests=10): """Initialize Lean4ProofEvaluator with sandbox.""" super().__init__(config, num_parallel_requests) - eval_config = LeanEvaluatorConfig(**self.config) - self.sandbox = get_sandbox(**eval_config.sandbox) - self.eval_config = eval_config + self.eval_config = LeanEvaluatorConfig(**self.config) + self.sandbox = get_sandbox(**self.eval_config.sandbox) async def eval_single(self, data_point: dict[str, any]) -> dict[str, any]: """Evaluate single Lean4 proof during generation.""" @@ -108,24 +125,3 @@ async def eval_single(self, data_point: dict[str, any]) -> dict[str, any]: "proof_status": proof_status, "lean_evaluation": {**output, "timeout": self.eval_config.timeout}, } - - -class Lean4StatementEvaluator(BaseEvaluator): - """Lean4 statement evaluator - only supports batch evaluation.""" - - def __init__(self, config: dict, num_parallel_requests=10): - """Initialize Lean4StatementEvaluator with sandbox.""" - super().__init__(config, num_parallel_requests) - eval_config = LeanEvaluatorConfig(**self.config) - self.sandbox = get_sandbox(**eval_config.sandbox) - self.eval_config = eval_config - - async def eval_full(self, input_files: list[str]) -> None: - """Batch evaluate Lean4 statements.""" - eval_config_dict = asdict(self.eval_config) - eval_config_dict.pop("sandbox") - await self.sandbox.batch_evaluate_results( - input_files=input_files, - 
answer_format="lean4-statement", - **eval_config_dict, - ) diff --git a/nemo_skills/evaluation/evaluator/mcq.py b/nemo_skills/evaluation/evaluator/mcq.py index e057fb3a12..b1974effcc 100644 --- a/nemo_skills/evaluation/evaluator/mcq.py +++ b/nemo_skills/evaluation/evaluator/mcq.py @@ -18,22 +18,22 @@ from tqdm import tqdm +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig from nemo_skills.evaluation.math_grader import extract_answer -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @nested_dataclass(kw_only=True) -class MCQEvaluatorConfig: +class MCQEvaluatorConfig(BaseEvaluatorConfig): extract_from_boxed: bool = True # only used if extract_from_boxed is False extract_regex: str = r"The final answer is (.+)$" def eval_mcq(cfg): - # Create config from cfg.eval_config (following pattern from other evaluators) - eval_config = MCQEvaluatorConfig(**cfg.eval_config) + eval_config = MCQEvaluatorConfig(**cfg) def extract_letter(text, extract_from_boxed: bool = True, extract_regex: str = r"The final answer is (.+)$"): # extract prediction from boxed{} or regex @@ -56,21 +56,22 @@ def extract_letter(text, extract_from_boxed: bool = True, extract_regex: str = r parsed_letter = match[-1].strip().upper() LOG.info( - f"Final parsed letter: {parsed_letter}, extract_from_boxed: {extract_from_boxed}, extract_regex: {extract_regex}, extracted_answer: {extracted_answer}" + f"Final parsed letter: {parsed_letter}, extract_from_boxed: {extract_from_boxed}, " + f"extract_regex: {extract_regex}, extracted_answer: {extracted_answer}" ) return parsed_letter - for file in unroll_files(cfg.input_files): - with open(file, "rt", encoding="utf-8") as fin: - data = [json.loads(line) for line in fin] - with open(file, "wt", encoding="utf-8") as fout: - for sample in tqdm(data): - # Per-sample values override config defaults for 
backward compatibility - extract_from_boxed = sample.get("extract_from_boxed", eval_config.extract_from_boxed) - extract_regex = sample.get("extract_regex", eval_config.extract_regex) - sample["predicted_answer"] = extract_letter( - sample["generation"], extract_from_boxed=extract_from_boxed, extract_regex=extract_regex - ) - sample["symbolic_correct"] = sample["predicted_answer"] == sample["expected_answer"] - fout.write(json.dumps(sample) + "\n") + jsonl_file = eval_config.input_file + with open(jsonl_file, "rt", encoding="utf-8") as fin: + data = [json.loads(line) for line in fin] + with open(jsonl_file, "wt", encoding="utf-8") as fout: + for sample in tqdm(data): + # Per-sample values override config defaults for backward compatibility + extract_from_boxed = sample.get("extract_from_boxed", eval_config.extract_from_boxed) + extract_regex = sample.get("extract_regex", eval_config.extract_regex) + sample["predicted_answer"] = extract_letter( + sample["generation"], extract_from_boxed=extract_from_boxed, extract_regex=extract_regex + ) + sample["symbolic_correct"] = sample["predicted_answer"] == sample["expected_answer"] + fout.write(json.dumps(sample) + "\n") diff --git a/nemo_skills/evaluation/evaluator/mrcr.py b/nemo_skills/evaluation/evaluator/mrcr.py index 1bf05da0aa..dd7c7ab280 100644 --- a/nemo_skills/evaluation/evaluator/mrcr.py +++ b/nemo_skills/evaluation/evaluator/mrcr.py @@ -18,12 +18,15 @@ from tqdm import tqdm -from nemo_skills.utils import get_logger_name, unroll_files +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) def eval_mrcr(cfg): + cfg = BaseEvaluatorConfig(**cfg) + def grade(response, answer, random_string_to_prepend) -> float: """ Compare response and answer. 
@@ -35,12 +38,12 @@ def grade(response, answer, random_string_to_prepend) -> float: answer = answer.removeprefix(random_string_to_prepend) return float(SequenceMatcher(None, response, answer).ratio()) - for file in unroll_files(cfg.input_files): - with open(file, "rt", encoding="utf-8") as fin: - data = [json.loads(line) for line in fin] - with open(file, "wt", encoding="utf-8") as fout: - for sample in tqdm(data): - sample["seq_match_ratio"] = grade( - sample["generation"], sample["expected_answer"], sample["random_string_to_prepend"] - ) - fout.write(json.dumps(sample) + "\n") + jsonl_file = cfg.input_file + with open(jsonl_file, "rt", encoding="utf-8") as fin: + data = [json.loads(line) for line in fin] + with open(jsonl_file, "wt", encoding="utf-8") as fout: + for sample in tqdm(data): + sample["seq_match_ratio"] = grade( + sample["generation"], sample["expected_answer"], sample["random_string_to_prepend"] + ) + fout.write(json.dumps(sample) + "\n") diff --git a/nemo_skills/evaluation/evaluator/ojbench.py b/nemo_skills/evaluation/evaluator/ojbench.py deleted file mode 100644 index 1ccab2fe0b..0000000000 --- a/nemo_skills/evaluation/evaluator/ojbench.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import asyncio -import json -import logging -import shlex -import textwrap -from dataclasses import field -from pathlib import Path - -from nemo_skills.code_execution.sandbox import Sandbox -from nemo_skills.evaluation.evaluator.code import preprocess_code -from nemo_skills.evaluation.evaluator.livecodebench import ( - execute_in_sandbox_with_retries, - is_sandbox_available, - sandbox_context, -) -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files - -LOG = logging.getLogger(get_logger_name(__file__)) - - -@nested_dataclass(kw_only=True) -class OJBenchConfig: - sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"}) - timeout: int = 6 - timeout_buffer: int = 60 - num_retries: int = 3 - - -async def install_packages(sandbox: Sandbox, eval_config: OJBenchConfig) -> bool: - """Helper to install packages inside the sandbox.""" - LOG.info("Installing required packages for ojbench evaluation...") - - clone_cmd = "git clone https://github.com/He-Ren/OJBench.git" - result, _ = await execute_in_sandbox_with_retries( - sandbox, eval_config.num_retries, clone_cmd, language="shell", timeout=300 - ) - if result["process_status"] != "completed": - stderr = result.get("stderr", "Unknown error") - LOG.warning(f"Failed to clone OJBench repo: {stderr}") - return False - - install_cmd = "pip install -e OJBench" - result, _ = await execute_in_sandbox_with_retries( - sandbox, eval_config.num_retries, install_cmd, language="shell", timeout=300 - ) - if result["process_status"] != "completed": - stderr = result.get("stderr", "Unknown error") - LOG.warning(f"Failed to install ojbench: {result.get('stderr', 'Unknown error')}") - return False - - LOG.info("Successfully installed ojbench.") - return True - - -async def eval_ojbench_async(cfg, eval_config: OJBenchConfig): - problem_dirs = [ - Path(cfg.data_dir, "ojbench/OJBench_testdata/NOI"), - Path(cfg.data_dir, "ojbench/OJBench_testdata/ICPC"), - ] - - async with 
sandbox_context(eval_config.sandbox) as sandbox: - if not await install_packages(sandbox, eval_config): - return - - for jsonl_file_str in unroll_files(cfg.input_files): - jsonl_file = Path(jsonl_file_str) - with open(jsonl_file, encoding="utf-8") as f_in: - samples = [] - for line in f_in: - sample = json.loads(line) - sample = preprocess_code(sample, sample["language"], strip_whitespace=True) - sample["prompt"] = sample.pop("question") - sample["content"] = f"```{sample['language']}\n{sample['completion']}\n```" - sample.pop("completion") - samples.append(sample) - - input_filename = jsonl_file.name.replace("output-", "eval-input-", 1) - eval_input_file = jsonl_file.with_name(input_filename) - results_filename = jsonl_file.name.replace("output-", "eval-results-", 1) - eval_results_file = jsonl_file.with_name(results_filename) - - with open(eval_input_file, "w", encoding="utf-8") as f_out: - f_out.writelines(json.dumps(sample) + "\n" for sample in samples) - - eval_code = textwrap.dedent(f""" - import ojbench - ojbench.init(problem_dirs={repr([str(p) for p in problem_dirs])}) - ojbench.judge_jsonl( - input_path={repr(str(eval_input_file))}, - output_path={repr(str(eval_results_file))}, - num_workers=16 - ) - """) - - cmd = f'env -i PATH="/usr/local/bin:/usr/bin:/bin" python3 -c {shlex.quote(eval_code)}' - output, _ = await execute_in_sandbox_with_retries( - sandbox, - eval_config.num_retries, - cmd, - language="shell", - timeout=eval_config.timeout * len(samples) + eval_config.timeout_buffer, - max_output_characters=100_000, - ) - - if output.get("process_status") != "completed": - raise RuntimeError(f"Evaluation failed for {jsonl_file}. 
Stderr: {output.get('stderr')}") - - with open(eval_results_file, "rt", encoding="utf-8") as fin: - results = [json.loads(line) for line in fin] - - if len(results) != len(samples): - LOG.error(f"Result count mismatch for {jsonl_file}: {len(results)} results vs {len(samples)} samples") - continue - - for sample, result in zip(samples, results, strict=True): - sample["verdict"] = result["verdict"] - sample["is_passed"] = result["is_passed"] - - with open(jsonl_file, "w", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - -def eval_ojbench(cfg): - """Synchronous wrapper to run the async evaluation.""" - eval_config = OJBenchConfig(**cfg.eval_config) - sandbox_is_ready = asyncio.run(is_sandbox_available(eval_config.sandbox)) - if sandbox_is_ready: - asyncio.run(eval_ojbench_async(cfg, eval_config)) - else: - raise RuntimeError("The OJBench evaluation requires a running sandbox, but the service was unreachable.") diff --git a/nemo_skills/evaluation/evaluator/ruler.py b/nemo_skills/evaluation/evaluator/ruler.py index 86d99645e6..b43393675a 100644 --- a/nemo_skills/evaluation/evaluator/ruler.py +++ b/nemo_skills/evaluation/evaluator/ruler.py @@ -14,17 +14,19 @@ import json import logging +import os import re from tqdm import tqdm -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @nested_dataclass(kw_only=True) -class RulerEvaluatorConfig: +class RulerEvaluatorConfig(BaseEvaluatorConfig): parse_func: str = "default" match_type: str @@ -54,7 +56,7 @@ def string_match_part_single(preds, refs): ][0] return score - eval_config = RulerEvaluatorConfig(**cfg.eval_config) + eval_config = RulerEvaluatorConfig(**cfg) parse_funcs = { "default": default_parse, @@ -64,14 +66,18 @@ def string_match_part_single(preds, refs): "part": 
string_match_part_single, } - for file in unroll_files(cfg.input_files): - with open(file, "rt", encoding="utf-8") as fin: - data = [json.loads(line) for line in fin] - with open(file, "wt", encoding="utf-8") as fout: - for sample in tqdm(data): - parse_result = parse_funcs[eval_config.parse_func](sample["generation"]) - sample["is_correct"] = match_type_funcs[eval_config.match_type]( - sample["generation"], sample["expected_answer"] - ) - sample["predicted_answer"] = parse_result - fout.write(json.dumps(sample) + "\n") + jsonl_file = eval_config.input_file + with open(jsonl_file, "rt", encoding="utf-8") as fin: + data = [json.loads(line) for line in fin] + for sample in tqdm(data): + parse_result = parse_funcs[eval_config.parse_func](sample["generation"]) + sample["is_correct"] = match_type_funcs[eval_config.match_type]( + sample["generation"], sample["expected_answer"] + ) + sample["predicted_answer"] = parse_result + + with open(jsonl_file + "-tmp", "wt", encoding="utf-8") as fout: + for sample in data: + fout.write(json.dumps(sample) + "\n") + + os.replace(jsonl_file + "-tmp", jsonl_file) diff --git a/nemo_skills/evaluation/evaluator/scicode.py b/nemo_skills/evaluation/evaluator/scicode.py index 2810b416c8..a0819af15e 100644 --- a/nemo_skills/evaluation/evaluator/scicode.py +++ b/nemo_skills/evaluation/evaluator/scicode.py @@ -19,14 +19,15 @@ from dataclasses import field from nemo_skills.code_execution.sandbox import get_sandbox +from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig from nemo_skills.inference.eval.scicode_utils import eval_prefix -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.utils import get_logger_name, nested_dataclass LOG = logging.getLogger(get_logger_name(__file__)) @nested_dataclass(kw_only=True) -class ScicodeEvaluatorConfig: +class ScicodeEvaluatorConfig(BaseEvaluatorConfig): sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"}) timeout: float = 30.0 
num_parallel_requests: int = 20 @@ -99,7 +100,7 @@ async def execute_with_semaphore(task_args): def eval_scicode(cfg): - eval_config = ScicodeEvaluatorConfig(**cfg.eval_config) + eval_config = ScicodeEvaluatorConfig(**cfg) # Install required packages for scicode evaluation LOG.info("Installing required packages for scicode evaluation...") @@ -128,11 +129,11 @@ async def install_packages(): asyncio.run(install_packages()) - for file in unroll_files(cfg.input_files): - with open(file, "rt", encoding="utf-8") as fin: - data = [json.loads(line) for line in fin] - status_lists = test_code(eval_config, data) - with open(file, "wt", encoding="utf-8") as fout: - for idx, elem in enumerate(data): - elem["eval_status"] = status_lists[idx] - fout.write(json.dumps(elem) + "\n") + jsonl_file = eval_config.input_file + with open(jsonl_file, "rt", encoding="utf-8") as fin: + data = [json.loads(line) for line in fin] + status_lists = test_code(eval_config, data) + with open(jsonl_file, "wt", encoding="utf-8") as fout: + for idx, elem in enumerate(data): + elem["eval_status"] = status_lists[idx] + fout.write(json.dumps(elem) + "\n") diff --git a/nemo_skills/evaluation/math_grader.py b/nemo_skills/evaluation/math_grader.py index 517b795019..75d1e5c5b5 100644 --- a/nemo_skills/evaluation/math_grader.py +++ b/nemo_skills/evaluation/math_grader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import glob import logging import re @@ -24,12 +23,6 @@ LOG = logging.getLogger(get_logger_name(__file__)) -def unroll_files(input_files): - for manifest_pattern in input_files: - for manifest in sorted(glob.glob(manifest_pattern, recursive=True)): - yield manifest - - def _additional_normalization(expr): # Remove % and \\% from the number percentage_pattern = r"^(\d+\.?\d*)(?:\\%|%)$" @@ -99,43 +92,6 @@ def math_equal(gt_answer, predicted_answer, take_modulo: int | None = None, **kw return verify(parsed_gt, parsed_pred, **kwargs) -def evaluate_result( - line_dict: dict, - numeric_precision=15, - timeout=10, - take_modulo=None, - use_predicted_answer_key: bool = False, - extract_from_boxed: bool = True, - extract_regex: str = r"The final answer is (.+)$", -): - if not line_dict: # can be empty for incomplete generations - return {} - - if not use_predicted_answer_key: - line_dict["predicted_answer"] = extract_answer( - line_dict["generation"], - extract_from_boxed=extract_from_boxed, - extract_regex=extract_regex, - ) - else: - if "predicted_answer" not in line_dict: - raise ValueError( - "predicted_answer key not found in the line_dict. 
Set use_predicted_answer_key=False to re-extract" - ) - - gt_answer = line_dict["expected_answer"] - predicted_answer = line_dict["predicted_answer"] - - line_dict["symbolic_correct"] = math_equal( - gt_answer, - predicted_answer, - take_modulo=take_modulo, - numeric_precision=numeric_precision, - timeout_seconds=timeout, - ) - return line_dict - - def extract_answer(string: str, extract_from_boxed: bool = True, extract_regex: str = r"The final answer is (.+)$"): """Extract Answer String from \\boxed expression or based on regex""" if not extract_from_boxed: diff --git a/nemo_skills/evaluation/metrics/code_metrics.py b/nemo_skills/evaluation/metrics/code_metrics.py index f5cc690a1c..8274efd77b 100644 --- a/nemo_skills/evaluation/metrics/code_metrics.py +++ b/nemo_skills/evaluation/metrics/code_metrics.py @@ -111,18 +111,6 @@ def update(self, predictions): self._compute_pass_at_k(predictions=predictions) -class OJBenchMetrics(BaseMetrics): - def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: - return {"accuracy": prediction["is_passed"]} - - def get_incorrect_sample(self, prediction: dict) -> dict: - return {"is_passed": False} - - def update(self, predictions): - super().update(predictions) - self._compute_pass_at_k(predictions=predictions) - - class HumanEvalInfillingMetrics(BaseMetrics): def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: return {"accuracy": prediction["passed"]} diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py index ae2433da15..5349559848 100644 --- a/nemo_skills/evaluation/metrics/map_metrics.py +++ b/nemo_skills/evaluation/metrics/map_metrics.py @@ -25,7 +25,6 @@ EvalPlusMetrics, HumanEvalInfillingMetrics, LiveCodeBenchMetrics, - OJBenchMetrics, SciCodeMetrics, SweBenchMetrics, ) @@ -59,7 +58,6 @@ "mrcr": MRCRMetrics, "aalcr": AALCRMetrics, "livebench_coding": LiveCodeBenchMetrics, - "ojbench": OJBenchMetrics, "translation": 
TranslationMetrics, "human_eval_infilling": HumanEvalInfillingMetrics, } diff --git a/nemo_skills/inference/eval/bfcl.py b/nemo_skills/inference/eval/bfcl.py index 58baf511c8..c2507a5ec4 100644 --- a/nemo_skills/inference/eval/bfcl.py +++ b/nemo_skills/inference/eval/bfcl.py @@ -60,7 +60,6 @@ class BFCLGenerationConfig(GenerateSolutionsConfig): # Inference server configuration {server_params} server: dict = field(default_factory=dict) - remove_thinking: bool = True use_client_parsing: bool = True model_name: str | None = None @@ -384,8 +383,9 @@ async def _generate_single_data_point_multi_turn(self, data_point): if self.cfg.count_prompt_tokens: output_dict["num_input_tokens_list"].append(model_response.get("num_input_tokens", 0)) - if self.cfg.remove_thinking: - trimmed_response_text = self._remove_thinking_from_message_content( + if self.cfg.parse_reasoning: + # TODO: replace with main parse_reasoning method + trimmed_response_text = self._parse_reasoning_from_message_content( self.message_parser.get_response_text(model_response["message"]) ) self.message_parser.set_response_text(model_response["message"], trimmed_response_text) @@ -450,13 +450,13 @@ async def _generate_single_data_point_multi_turn(self, data_point): return output_dict - def _remove_thinking_from_message_content(self, model_response_text: str | None): + def _parse_reasoning_from_message_content(self, model_response_text: str | None): """If specified, remove the thinking part of the model response text.""" if model_response_text is None: return None - if self.cfg.thinking_end in model_response_text: - return model_response_text.split(self.cfg.thinking_end)[-1].lstrip("\n") + if self.cfg.end_reasoning_string in model_response_text: + return model_response_text.split(self.cfg.end_reasoning_string)[-1].lstrip("\n") else: # If the thinking didn't finish, we can keep it empty return "" diff --git a/nemo_skills/inference/eval/scicode.py b/nemo_skills/inference/eval/scicode.py index 343b156a01..b7ed8ee0db 
100644 --- a/nemo_skills/inference/eval/scicode.py +++ b/nemo_skills/inference/eval/scicode.py @@ -18,11 +18,25 @@ import hydra -from nemo_skills.inference.eval.scicode_utils import extract_python_script, prefilled_steps_code, process_problem_steps -from nemo_skills.inference.generate import GenerateSolutionsConfig, GenerationTask, InferenceConfig +from nemo_skills.inference.eval.scicode_utils import ( + extract_python_script, + prefilled_steps_code, + process_problem_steps, +) +from nemo_skills.inference.generate import ( + GenerateSolutionsConfig, + GenerationTask, + InferenceConfig, +) from nemo_skills.inference.model import server_params from nemo_skills.inference.model.utils import is_context_window_exceeded_error -from nemo_skills.utils import get_help_message, get_logger_name, nested_dataclass, remove_thinking, setup_logging +from nemo_skills.utils import ( + get_help_message, + get_logger_name, + nested_dataclass, + parse_reasoning, + setup_logging, +) LOG = logging.getLogger(get_logger_name(__file__)) @@ -41,8 +55,6 @@ class SciCodeGenerationConfig(GenerateSolutionsConfig): prompt_config: str = "eval/scicode/background" with_background: bool = True - remove_thinking: bool = True # changing default - cs = hydra.core.config_store.ConfigStore.instance() cs.store(name="base_scicode_generation_config", node=SciCodeGenerationConfig) @@ -103,8 +115,8 @@ async def process_single_datapoint(self, data_point, all_data): full_outputs[f"{problem_id}.{cur_step + 1}"] = llm_output total_generated_tokens += llm_output.get("num_generated_tokens", 0) - if self.cfg.remove_thinking: - remove_thinking(llm_output, "generation", self.cfg.thinking_begin, self.cfg.thinking_end) + if self.cfg.parse_reasoning: + parse_reasoning(llm_output, "generation", self.cfg.end_reasoning_string) extracted_python = extract_python_script(llm_output["generation"]) previous_llm_code[cur_step] = extracted_python # TODO: save those as separate entries so that we can preserve intermediate progress on 
reruns diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py index 1363ba205f..431d7ebd8c 100644 --- a/nemo_skills/inference/eval/swebench.py +++ b/nemo_skills/inference/eval/swebench.py @@ -30,7 +30,12 @@ from nemo_skills.inference.generate import GenerationTask from nemo_skills.inference.model import server_params from nemo_skills.prompt.utils import get_config_path -from nemo_skills.utils import get_help_message, get_logger_name, nested_dataclass, setup_logging +from nemo_skills.utils import ( + get_help_message, + get_logger_name, + nested_dataclass, + setup_logging, +) LOG = logging.getLogger(get_logger_name(__file__)) @@ -139,9 +144,12 @@ class SweBenchGenerationConfig: dry_run: bool = False # if True, will move full generation to _full_generation key and keep cfg.generation_key without thinking tokens - remove_thinking: bool = False - thinking_begin: str = "" - thinking_end: str = "" + parse_reasoning: bool = False + end_reasoning_string: str = "" + + # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped + eval_type: str | None = None # "lean4-proof", "math", etc. + eval_config: dict = field(default_factory=dict) # Config for the evaluator cs = hydra.core.config_store.ConfigStore.instance() @@ -165,6 +173,14 @@ def __init__(self, cfg: SweBenchGenerationConfig): # needs to skip completed samples, not used otherwise self.cfg.prompt_format = "ns" + if self.cfg.eval_type is not None: + raise ValueError( + "SWE-bench generation task does not support eval_type parameter. Evaluation is done automatically." 
+ ) + + self.should_run_evaluation = False + self.evaluator = None + def log_example_prompt(self, data): return @@ -180,7 +196,7 @@ def setup_litellm_cache(self): def cleanup_litellm_cache(self): return - async def apply_evaluation_hook(self, data_point): + async def evaluate_single_datapoint(self, data_point): # currently evaluation is done directly after generation already return data_point diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 4e9d655aaf..d2d4ca3e9f 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -32,6 +32,11 @@ from transformers import AutoTokenizer from nemo_skills.code_execution.sandbox import get_sandbox, sandbox_params +from nemo_skills.evaluation.evaluator import ( + evaluate, + get_evaluator_class, + supports_single_eval, +) from nemo_skills.inference.model import ( ParallelThinkingConfig, get_code_execution_model, @@ -48,7 +53,7 @@ get_logger_name, get_server_wait_cmd, nested_dataclass, - remove_thinking, + parse_reasoning, setup_logging, ) @@ -79,7 +84,7 @@ class InferenceConfig: @nested_dataclass(kw_only=True) class GenerateSolutionsConfig: - """LLM generation parameters.""" + """Generation parameters.""" input_file: str # Path to the input file with data output_file: str # Where to save the generations @@ -172,14 +177,14 @@ class GenerateSolutionsConfig: tool_overrides: dict | None = field(default_factory=dict) # if True, will move full generation to _full_generation key and keep cfg.generation_key without thinking tokens - remove_thinking: bool = False - thinking_begin: str = "" - thinking_end: str = "" + # IMPORTANT: do not set this for non-reasoning models as it will make the generations empty! 
+ parse_reasoning: bool = False + end_reasoning_string: str = "" # If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures) enable_litellm_cache: bool = False - # Evaluation during generation + # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped eval_type: str | None = None # "lean4-proof", "math", etc. eval_config: dict = field(default_factory=dict) # Config for the evaluator @@ -329,20 +334,13 @@ def __init__(self, cfg: GenerateSolutionsConfig): self.extra_generate_params = {} # Setup evaluator if specified + self.should_run_evaluation = self.cfg.eval_type is not None self.evaluator = None - if self.cfg.eval_type: - from nemo_skills.evaluation.evaluator import ( - get_evaluator_class, - supports_single_eval, - ) - - if not supports_single_eval(self.cfg.eval_type, self.cfg.eval_config): - raise ValueError( - f"Evaluator '{self.cfg.eval_type}' does not support single evaluation during generation. " - f"Use the evaluation pipeline instead." - ) - - self.evaluator = get_evaluator_class(self.cfg.eval_type, self.cfg.eval_config) + if self.should_run_evaluation: + self.cfg.eval_config = dict(self.cfg.eval_config) + if supports_single_eval(self.cfg.eval_type, self.cfg.eval_config): + LOG.info("Evaluator supports per-datapoint evals, will interleave evaluation with generation.") + self.evaluator = get_evaluator_class(self.cfg.eval_type, self.cfg.eval_config) LOG.info( "Async loop is maintaining %d generations in parallel. " @@ -399,9 +397,9 @@ def setup_llm(self): if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg inference_override_config = { - "remove_thinking": self.cfg.parallel_thinking.remove_thinking, # Removing thinking from solutions is important for parallel_thinking. 
We don't want to override this with the main generation config "endpoint_type": self.cfg.parallel_thinking.endpoint_type, - # The following are specific to parallel thinking and we want to defend against any future key overlaps with the main generation config + # The following are specific to parallel thinking and we want + # to defend against any future key overlaps with the main generation config "mode": self.cfg.parallel_thinking.mode, "window_size": self.cfg.parallel_thinking.window_size, "solution_key": self.cfg.parallel_thinking.solution_key, @@ -454,6 +452,11 @@ def postprocess(self): """ pass + def run_batch_evaluation(self): + """Run final evaluation consuming all data together if configured.""" + self.cfg.eval_config["input_file"] = self.cfg.output_file + evaluate(self.cfg.eval_type, self.cfg.eval_config) + def skip_completed_samples(self, data): # if non-async file exists and we are asked to skip filled, then there is no more data to process if self.cfg.skip_filled and Path(self.cfg.output_file).exists(): @@ -519,25 +522,31 @@ def fill_prompt(self, data_point, data): return filled_prompt def dump_outputs(self, outputs, data_points, fout): - for output, original_data_point in zip(outputs, data_points): - # to make it easier to follow up with evaluation and limit accidental errors, we are adding - # all of the ground-truth data to the output file alongside the generated solutions - output[self.cfg.generation_key] = output.pop("generation") - - if not self.cfg.add_generation_stats: - output.pop("generation_start_time", None) - output.pop("generation_end_time", None) - output.pop("generation_time", None) - output.pop("num_generated_tokens", None) - output.pop("num_input_tokens", None) - - for key in output: - original_data_point.pop(key, None) - output.update(original_data_point) - if self.cfg.remove_thinking: - remove_thinking(output, self.cfg.generation_key, self.cfg.thinking_begin, self.cfg.thinking_end) + for output in outputs: 
fout.write(json.dumps(output) + "\n") + async def postprocess_single_output(self, output, original_data_point): + # to make it easier to follow up with other generations and limit accidental errors, we are adding + # all of the original data to the output file alongside the new generations + output[self.cfg.generation_key] = output.pop("generation") + + if not self.cfg.add_generation_stats: + output.pop("generation_start_time", None) + output.pop("generation_end_time", None) + output.pop("generation_time", None) + output.pop("num_generated_tokens", None) + output.pop("num_input_tokens", None) + + for key in output: + original_data_point.pop(key, None) + output.update(original_data_point) + if self.cfg.parse_reasoning: + parse_reasoning( + output, + self.cfg.generation_key, + self.cfg.end_reasoning_string, + ) + def prefill_generation(self, data_point) -> dict | None: """Prefill generation in case LLM is not required.""" # Override this method to customize the prefilling behavior. @@ -580,13 +589,12 @@ async def generate_with_semaphore(self, **generation_params): async with self.semaphore: return await self.llm.generate_async(**generation_params) - async def apply_evaluation_hook(self, data_point): - if self.evaluator: - eval_start_time = time.time() - eval_results = await self.evaluator.eval_single(data_point) - eval_end_time = time.time() - data_point["interleaved_eval_single_time_s"] = eval_end_time - eval_start_time - data_point.update(eval_results) + async def evaluate_single_datapoint(self, data_point): + eval_start_time = time.time() + eval_results = await self.evaluator.eval_single(data_point) + eval_end_time = time.time() + data_point["interleaved_eval_single_time_s"] = eval_end_time - eval_start_time + data_point.update(eval_results) return data_point async def _generate_and_save_datapoint(self, data_point, all_data, fout, pbar): @@ -601,12 +609,11 @@ async def _generate_and_save_datapoint(self, data_point, all_data, fout, pbar): 
output["generation_end_time"] = end_time output["generation_time"] = end_time - start_time - # Apply evaluation hook if configured - # TODO: note that this currently only evaluates independently--if there - # is any post-processing that needs to be done on the full set of - # generations, this will not work correctly, and we might need another - # hook at the end of generation to make it work properly - output = await self.apply_evaluation_hook({**data_point, **output}) + await self.postprocess_single_output(output, data_point) + + # evaluate single-data point if requested and evaluator supports that + if self.should_run_evaluation and self.evaluator: + output = await self.evaluate_single_datapoint({**data_point, **output}) # Thread-safe output writing async with self.output_lock: @@ -637,6 +644,12 @@ async def async_loop(self, data): with open(self.cfg.output_file + "-async", "at", encoding="utf-8", buffering=1) as fout: # Dump prefilled data first if len(prefilled_data_points) > 0: + for output, data_point in zip(prefilled_outputs, prefilled_data_points): + await self.postprocess_single_output(output, data_point) + + # evaluate single-data point if requested and evaluator supports that + if self.should_run_evaluation and self.evaluator: + output = await self.evaluate_single_datapoint({**data_point, **output}) async with self.output_lock: self.dump_outputs(prefilled_outputs, prefilled_data_points, fout) @@ -704,25 +717,26 @@ def generate(self): data = self.skip_completed_samples(data) if len(data) == 0: - LOG.info("No data to process, exiting.") - return - - data = self.preprocess_data(data) + LOG.info("No data to process, skipping generation") + else: + data = self.preprocess_data(data) - self.log_example_prompt(data) + self.log_example_prompt(data) - if self.cfg.dry_run: - LOG.info("Exiting without running generation as dry_run flag is set.") - return + if self.cfg.dry_run: + LOG.info("Exiting without running generation as dry_run flag is set.") + return - if 
not self.cfg.skip_filled: - for output_path in [Path(self.cfg.output_file), Path(self.cfg.output_file + "-async")]: - if output_path.exists(): - output_path.unlink() + if not self.cfg.skip_filled: + for output_path in [Path(self.cfg.output_file), Path(self.cfg.output_file + "-async")]: + if output_path.exists(): + output_path.unlink() - self.wait_for_server() - asyncio.run(self.async_loop(data)) + self.wait_for_server() + asyncio.run(self.async_loop(data)) + if self.should_run_evaluation and self.evaluator is None: + self.run_batch_evaluation() self.postprocess() diff --git a/nemo_skills/inference/model/parallel_thinking.py b/nemo_skills/inference/model/parallel_thinking.py index 2c68467fef..7c96bd57d4 100644 --- a/nemo_skills/inference/model/parallel_thinking.py +++ b/nemo_skills/inference/model/parallel_thinking.py @@ -27,7 +27,7 @@ from transformers import AutoTokenizer from nemo_skills.prompt.utils import get_prompt, get_token_count -from nemo_skills.utils import get_logger_name, nested_dataclass, remove_thinking +from nemo_skills.utils import get_logger_name, nested_dataclass, parse_reasoning from .base import BaseModel, EndpointType @@ -51,9 +51,8 @@ class ParallelThinkingConfig: temperature: float = 0.6 tokens_to_generate: int | None = None - remove_thinking: bool = True # Remove thinking tokens from the solution key - thinking_begin: str = "" - thinking_end: str = "" + parse_reasoning: bool = False + end_reasoning_string: str = "" endpoint_type: EndpointType = EndpointType.chat tokenizer: str | None = None chat_template_kwargs: dict = field(default_factory=dict) @@ -151,12 +150,11 @@ async def generate_solutions( generation_results = await asyncio.gather(*tasks) solutions = [] for generation_result in generation_results: - if self.cfg.remove_thinking: - remove_thinking( + if self.cfg.parse_reasoning: + parse_reasoning( generation_result, generation_key=self.cfg.solution_key, - thinking_begin=self.cfg.thinking_begin, - thinking_end=self.cfg.thinking_end, + 
end_reasoning_string=self.cfg.end_reasoning_string, ) if self.cfg.solution_length_cap is not None: @@ -194,12 +192,11 @@ def _load_solutions(self, input_dir: str) -> Dict[str, List[Dict]]: with open(input_file, "r") as f: for line in f: data_point = json.loads(line) - if self.cfg.remove_thinking: - remove_thinking( + if self.cfg.parse_reasoning: + parse_reasoning( data_point, generation_key=self.cfg.solution_key, - thinking_begin=self.cfg.thinking_begin, - thinking_end=self.cfg.thinking_end, + end_reasoning_string=self.cfg.end_reasoning_string, ) if self.cfg.solution_length_cap is not None: @@ -241,13 +238,7 @@ async def _get_multiple_solutions( # Remove unfinished solutions filtered_solutions = [] for solution in solutions: - # Check if thinking_begin is in the solution and thinking_end is not in the solution - if ( - self.cfg.thinking_begin in solution[self.cfg.solution_key] - and self.cfg.thinking_end not in solution[self.cfg.solution_key] - ): - continue - elif solution[self.cfg.solution_key] == "": + if solution[self.cfg.solution_key] == "": LOG.warning("Solution is empty, skipping") continue else: diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index 500fc7481f..1977e64581 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -132,7 +132,6 @@ def eval( qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), mount_paths: str = typer.Option(None, help="Comma separated list of paths to mount on the remote machine"), - extra_eval_args: str = typer.Option("", help="Additional arguments for evaluation"), auto_summarize_results: bool = typer.Option( True, help="If True, will automatically launch summarize results tasks" ), @@ -318,7 +317,6 @@ def eval( with_sandbox, keep_mounts_for_sandbox, wandb_parameters, - extra_eval_args, eval_requires_judge=eval_requires_judge, generation_type=generation_type, generation_module=generation_module, @@ -425,6 +423,7 @@ def eval( expname=f"{expname}-{benchmark}-judge", log_dir=log_dir + "/judge", cluster=cluster, + config_dir=config_dir, partition=partition, qos=qos, time_min=time_min, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index b4552da18d..8735126aa9 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -218,9 +218,6 @@ def generate( ), qos: str = typer.Option(None, help="Specify Slurm QoS, e.g. 
to request interactive nodes"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), - eval_args: str = typer.Option( - None, help="Specify if need to run nemo_skills/evaluation/evaluate_results.py on the generation outputs" - ), run_after: List[str] = typer.Option( None, help="Can specify a list of expnames that need to be completed before this one starts" ), @@ -408,7 +405,6 @@ def generate( random_seed=seed, output_dir=output_dir, extra_arguments=extra_arguments, - eval_args=eval_args, chunk_id=chunk_id, num_chunks=num_chunks, preprocess_cmd=preprocess_cmd, @@ -492,8 +488,11 @@ def generate( skip_hf_home_check=skip_hf_home_check, ) + # TODO: remove after https://github.com/NVIDIA-NeMo/Skills/issues/578 is resolved as default will be single job + sequential = True if cluster_config["executor"] in ["local", "none"] else False + # Pass _reuse_exp to pipeline.run() to add jobs to existing experiment - result = pipeline.run(dry_run=dry_run, _reuse_exp=_reuse_exp) + result = pipeline.run(dry_run=dry_run, _reuse_exp=_reuse_exp, sequential=sequential) return result diff --git a/nemo_skills/pipeline/prepare_data.py b/nemo_skills/pipeline/prepare_data.py index 10eae1d3e7..e0a417620f 100644 --- a/nemo_skills/pipeline/prepare_data.py +++ b/nemo_skills/pipeline/prepare_data.py @@ -27,7 +27,7 @@ # TODO: read this from init.py -DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "ojbench"] +DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24"] @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py index e4d339a77d..dbbaf2d6c0 100644 --- a/nemo_skills/pipeline/utils/declarative.py +++ b/nemo_skills/pipeline/utils/declarative.py @@ -340,18 +340,14 @@ def _validate(self): if not is_mounted_filepath(self.cluster_config, env_vars["HF_HOME"]): raise RuntimeError(f"Invalid cluster_config: HF_HOME={env_vars['HF_HOME']} is 
not a mounted path.") - def run( - self, - dry_run: bool = False, - log_dir: Optional[str] = None, - _reuse_exp=None, - ): + def run(self, dry_run: bool = False, log_dir: Optional[str] = None, _reuse_exp=None, sequential: bool = False): """Execute the pipeline by calling NeMo-Run directly. Args: dry_run: If True, validate without executing log_dir: Default log directory for groups that don't specify one (optional) _reuse_exp: Internal - reuse existing experiment object (for eval.py integration) + sequential: If True, run tasks sequentially (only makes sense for local/none executors) """ # Track job name -> task handle for dependency resolution job_name_to_handle = {} @@ -467,7 +463,7 @@ def run( # Only run if not using existing experiment (matching generate_v0.py line 331) if not dry_run and not _reuse_exp: - run_exp(exp, self.cluster_config) + run_exp(exp, self.cluster_config, sequential=sequential) # Cache experiment for code reuse in future runs if self.cluster_config["executor"] != "none": diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index 1b0bc86216..936dcc998f 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -15,8 +15,6 @@ import importlib import logging import os -import re -import shlex from copy import deepcopy from dataclasses import dataclass, field from pathlib import Path @@ -24,7 +22,6 @@ import nemo_skills.pipeline.utils as pipeline_utils from nemo_skills.dataset.utils import get_dataset_module, import_from_path -from nemo_skills.evaluation.evaluator import supports_single_eval from nemo_skills.inference import GENERATION_MODULE_MAP from nemo_skills.inference.generate import GenerationTask from nemo_skills.utils import compute_chunk_ids, get_logger_name @@ -32,65 +29,11 @@ LOG = logging.getLogger(get_logger_name(__file__)) -def parse_eval_args(eval_args: str) -> tuple[str | None, dict]: - # TODO we ideally don't want to rely on custom parsing of the command, but - # 
some major refactoring or clever ideas might be needed - """Parse eval_args string to extract eval_type and eval_config. - - Handles Hydra argument formats: - - ++eval_type=value (override) - - +eval_type=value (new) - - eval_type=value (config) - """ - if not eval_args: - return None, {} - - eval_type = None - eval_config = {} - - # Parse eval_args to extract eval_type and eval_config - eval_arg_parts = shlex.split(eval_args) - for part in eval_arg_parts: - # Match eval_type with any Hydra prefix - eval_type_match = re.match(r"^(\+{0,2})eval_type=(.+)$", part) - if eval_type_match: - eval_type = eval_type_match.group(2) - continue - - # Match eval_config with any Hydra prefix - eval_config_match = re.match(r"^(\+{0,2})eval_config\.(.+)$", part) - if eval_config_match: - config_part = eval_config_match.group(2) - if "=" in config_part: - key, value = config_part.split("=", 1) - # Handle nested keys like sandbox.timeout - if "." in key: - main_key, sub_key = key.split(".", 1) - if main_key not in eval_config: - eval_config[main_key] = {} - eval_config[main_key][sub_key] = value - else: - eval_config[key] = value - - return eval_type, eval_config - - -def should_use_single_eval(eval_args: str) -> bool: - """Determine if evaluation should be done during generation (single) vs after (batch).""" - eval_type, eval_config = parse_eval_args(eval_args) - - if not eval_type: - return False - - return supports_single_eval(eval_type, eval_config) - - @dataclass class BenchmarkArgs: name: str input_file: str generation_args: str - eval_args: str judge_args: str judge_pipeline_args: dict requires_sandbox: bool @@ -183,6 +126,11 @@ def get_benchmark_args_from_module( generation_args = get_arg_from_module_or_dict(benchmark_module, "GENERATION_ARGS", "", override_dict=override_dict) if prompt_config: generation_args = f"++prompt_config={prompt_config} {generation_args}" + # this is deprecated, should remove in the future + eval_args = get_arg_from_module_or_dict(benchmark_module, 
"EVAL_ARGS", "", override_dict=override_dict) + if eval_args: + generation_args = f"{eval_args} {generation_args}" + generation_args += f" ++eval_config.split={split} " requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict) keep_mounts_for_sandbox = get_arg_from_module_or_dict( benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict @@ -200,7 +148,6 @@ def get_benchmark_args_from_module( get_arg_from_module_or_dict(benchmark_module, "JUDGE_PIPELINE_ARGS", {}, override_dict) ) judge_args = get_arg_from_module_or_dict(benchmark_module, "JUDGE_ARGS", "", override_dict) - eval_args = get_arg_from_module_or_dict(benchmark_module, "EVAL_ARGS", override_dict=override_dict) num_samples = get_arg_from_module_or_dict(benchmark_module, "NUM_SAMPLES", 0, override_dict) num_chunks = get_arg_from_module_or_dict(benchmark_module, "NUM_CHUNKS", 0, override_dict) if num_chunks == 0: @@ -218,17 +165,15 @@ def get_benchmark_args_from_module( # when running locally swe-bench launches apptainer inside docker and this required elevated privileges # TODO: is there a better way to handle this? + # TODO: handle properly without polluting environment for future calls if benchmark == "swe-bench" and cluster_config["executor"] == "local": LOG.info("Swe-bench requires extra docker privileges, setting NEMO_SKILLS_PRIVILEGED_DOCKER=1") os.environ["NEMO_SKILLS_PRIVILEGED_DOCKER"] = "1" - eval_args += f" ++split={split} " - return BenchmarkArgs( name=benchmark, input_file=input_file, generation_args=generation_args, - eval_args=eval_args, judge_args=judge_args, judge_pipeline_args=judge_pipeline_args, requires_sandbox=requires_sandbox, @@ -275,7 +220,7 @@ def add_default_args( override_dict=override_dict, ) if data_dir: - benchmark_args.eval_args += f" ++data_dir={data_dir} " + benchmark_args.generation_args += f" ++eval_config.data_dir={data_dir} " # TODO: should it be optional? 
benchmark_args.score_module = benchmark_or_group_module.SCORE_MODULE @@ -295,7 +240,7 @@ def add_default_args( ) if data_dir: - benchmark_args.eval_args += f" ++data_dir={data_dir} " + benchmark_args.generation_args += f" ++eval_config.data_dir={data_dir} " return [benchmark_args] @@ -319,7 +264,6 @@ def prepare_eval_commands( with_sandbox, keep_mounts_for_sandbox, wandb_parameters, - extra_eval_args, eval_requires_judge, generation_type=None, generation_module=None, @@ -479,46 +423,18 @@ def prepare_eval_commands( f"Class {generation_task} overrides get_server_command_fn, " "which is not supported for evaluation when grouping jobs." ) - # Determine evaluation strategy - combined_eval_args = f"{benchmark_args.eval_args} {extra_eval_args}".strip() - - if should_use_single_eval(combined_eval_args): - # Add evaluation to generation arguments (single eval) - eval_type, eval_config = parse_eval_args(combined_eval_args) - eval_extra_args = f" ++eval_type={eval_type} " - - # Add eval_config parameters - for key, value in eval_config.items(): - if isinstance(value, dict): - for nested_key, nested_value in value.items(): - eval_extra_args += f" ++eval_config.{key}.{nested_key}={nested_value} " - else: - eval_extra_args += f" ++eval_config.{key}={value} " - - full_extra_arguments = ( - f"{generation_task.get_generation_default_args()} " - f"{benchmark_args.generation_args} " - f"{job_extra_arguments} " - f"{eval_extra_args} " - ) - # No separate eval command - eval_args_for_cmd = None - else: - # Use batch evaluation (separate command) - full_extra_arguments = ( - f"{generation_task.get_generation_default_args()} " - f"{benchmark_args.generation_args} " - f"{job_extra_arguments} " - ) - eval_args_for_cmd = combined_eval_args + full_extra_arguments = ( + f"{generation_task.get_generation_default_args()} " + f"{benchmark_args.generation_args} " + f"{job_extra_arguments} " + ) cmd = pipeline_utils.get_generation_cmd( input_file=benchmark_args.input_file, 
output_dir=benchmark_output_dir, extra_arguments=full_extra_arguments, random_seed=seed, - eval_args=eval_args_for_cmd, chunk_id=chunk_id, num_chunks=benchmark_args.num_chunks, script=generation_module or benchmark_args.generation_module, diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index 60c92174a8..74921ba20a 100644 --- a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -181,7 +181,6 @@ def get_generation_cmd( input_dir=None, extra_arguments="", random_seed=None, - eval_args=None, chunk_id=None, num_chunks=None, preprocess_cmd=None, @@ -265,12 +264,8 @@ def get_generation_cmd( else: postprocess_cmd = job_end_cmd - cmd += f" {extra_arguments} " - - if eval_args: - cmd += ( - f" && python -m nemo_skills.evaluation.evaluate_results ++input_files={output_file} {eval_args} " - ) + if extra_arguments: + cmd += f" {extra_arguments} " return wrap_cmd( cmd=cmd, diff --git a/nemo_skills/training/data_preparation_utils/preprocessing.py b/nemo_skills/training/data_preparation_utils/preprocessing.py index 12fa62d453..c2ea9245ea 100644 --- a/nemo_skills/training/data_preparation_utils/preprocessing.py +++ b/nemo_skills/training/data_preparation_utils/preprocessing.py @@ -485,7 +485,7 @@ def process(self): if self.prompt: output_sample["input"] = self.prompt.fill( - input_dict=elem, chat_template_kwargs=self.chat_template_kwargs + input_dict=elem, chat_template_kwargs=self.chat_template_kwargs, format_as_string=True ) else: output_sample["input"] = elem[self.input_key] diff --git a/nemo_skills/training/nemo_rl/start_grpo.py b/nemo_skills/training/nemo_rl/start_grpo.py index f1a228d21f..afe7d3f7be 100644 --- a/nemo_skills/training/nemo_rl/start_grpo.py +++ b/nemo_skills/training/nemo_rl/start_grpo.py @@ -164,7 +164,7 @@ def ns_data_processor( # we need to include system message here as roles are only used for masking # so prompt.fill can return a combined system + user message # if we 
use separate, it will have double BOS in the tokens! - user_message = prompt.fill(datum_dict) + user_message = prompt.fill(datum_dict, format_as_string=True) message_log = [ { "role": "user", diff --git a/nemo_skills/utils.py b/nemo_skills/utils.py index d8294649fe..18e6f63c73 100644 --- a/nemo_skills/utils.py +++ b/nemo_skills/utils.py @@ -32,19 +32,38 @@ # isort: off import nemo_skills from nemo_skills.file_utils import calculate_chunk_indices, unroll_files, jdump, jload, jload_chunk, count_newlines + # isort: on -def remove_thinking( - sample: dict, generation_key: str = "generation", thinking_begin: str = "", thinking_end: str = "" -): - sample["_has_think_tags"] = thinking_begin in sample[generation_key] - if thinking_end in sample[generation_key]: - sample["_full_generation"] = sample[generation_key] - sample[generation_key] = sample[generation_key].split(thinking_end)[-1].strip() - elif thinking_begin in sample[generation_key]: - sample["_full_generation"] = sample[generation_key] +def get_logger_name(file): + if "/nemo_skills/" in file: + return "nemo_skills" + file.split("nemo_skills")[1].replace("/", ".").replace(".py", "") + else: + return f"[external] {Path(file).stem}" + + +LOG = logging.getLogger(get_logger_name(__file__)) + + +def parse_reasoning(sample: dict, generation_key: str = "generation", end_reasoning_string: str = ""): + # not doing anything if generation isn't a string + # TODO: should we be more explicit about this? 
+ if not isinstance(sample[generation_key], str): + return + sample[f"_{generation_key}_finished_thinking"] = end_reasoning_string in sample[generation_key] + if end_reasoning_string in sample[generation_key]: + sample[f"_full_{generation_key}"] = sample[generation_key] + sample[generation_key] = sample[generation_key].split(end_reasoning_string)[-1].strip() + else: + sample[f"_full_{generation_key}"] = sample[generation_key] sample[generation_key] = "" # no end tag, so setting the generation to empty + LOG.warning( + "Thinking end tag `%s` not found in generation; setting generation to empty. " + "If this happens for every generation, you might have accidentally set ++parse_reasoning=True for a " + "non-reasoning model or have incorrect end tag.", + end_reasoning_string, + ) def nested_dataclass(*args, **kwargs): @@ -128,13 +147,6 @@ def remove_handlers(): logger.removeHandler(handler) -def get_logger_name(file): - if "/nemo_skills/" in file: - return "nemo_skills" + file.split("nemo_skills")[1].replace("/", ".").replace(".py", "") - else: - return f"[external] {Path(file).stem}" - - def get_skills_root_dir(): """Get the root directory of the NeMo Skills package.""" return os.path.dirname(os.path.dirname(os.path.abspath(nemo_skills.__file__))) diff --git a/recipes/openmathreasoning/scripts/simplified_recipe.py b/recipes/openmathreasoning/scripts/simplified_recipe.py index 6b7ae5625a..f18199c4da 100644 --- a/recipes/openmathreasoning/scripts/simplified_recipe.py +++ b/recipes/openmathreasoning/scripts/simplified_recipe.py @@ -62,7 +62,7 @@ def run_sdg(workspace, cluster, num_gpus, expname_prefix, wandb_params): expname=f"{expname_prefix}-problem-extraction", run_after=f"{expname_prefix}-download-assets", model="Qwen/Qwen2.5-14B-Instruct", - server_type="vllm", + server_type="sglang", server_gpus=num_gpus, log_samples=not wandb_params["disable_wandb"], # using prefix as group to make it easier to see all sdg steps together @@ -138,7 +138,7 @@ def 
run_training(workspace, cluster, num_gpus, expname_prefix, wandb_params): def final_eval(workspace, cluster, num_gpus, expname_prefix, wandb_params): # launching evaluation eval( - ctx=wrap_arguments("++inference.tokens_to_generate=16384 "), + ctx=wrap_arguments("++inference.tokens_to_generate=16384 ++parse_reasoning=True "), cluster=cluster, model=f"{workspace}/training/qwen2.5-14b-improved-hf", server_type="vllm", diff --git a/recipes/openreasoning/eval.py b/recipes/openreasoning/eval.py index 486e5cfe18..b555332c00 100644 --- a/recipes/openreasoning/eval.py +++ b/recipes/openreasoning/eval.py @@ -38,7 +38,7 @@ def eval_aai(model_size): eval( - ctx=wrap_arguments(f"++inference.tokens_to_generate={eval_tokens} "), + ctx=wrap_arguments(f"++inference.tokens_to_generate={eval_tokens} ++parse_reasoning=True "), cluster=cluster, expname=f"eval-aai-{model_size}", output_dir=f"{output_dir}/{model_size}", @@ -60,7 +60,9 @@ def eval_math(model_size): "hmmt_feb25", ] eval( - ctx=wrap_arguments(f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 "), + ctx=wrap_arguments( + f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++parse_reasoning=True " + ), cluster=cluster, expname=f"eval-math-{model_size}", output_dir=f"{output_dir}/{model_size}", @@ -77,6 +79,7 @@ def eval_math(model_size): ctx=wrap_arguments( f"++inference.tokens_to_generate={eval_tokens} " "++inference.temperature=0.6 " + "++parse_reasoning=True " "++parallel_thinking.mode=genselect " f"++parallel_thinking.generation_dir={output_dir}/{model_size}/eval-results/{bench} " ), @@ -93,7 +96,9 @@ def eval_math(model_size): def eval_code(model_size): eval( - ctx=wrap_arguments(f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 "), + ctx=wrap_arguments( + f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++parse_reasoning=True " + ), cluster=cluster, expname=f"eval-code-{model_size}", 
output_dir=f"{output_dir}/{model_size}", @@ -108,7 +113,7 @@ def eval_code(model_size): def eval_science(model_size): eval( ctx=wrap_arguments( - f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++prompt_config=eval/aai/mcq-4choices-boxed " + f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++prompt_config=eval/aai/mcq-4choices-boxed ++parse_reasoning=True " ), cluster=cluster, expname=f"eval-gpqa-{model_size}", @@ -120,7 +125,7 @@ def eval_science(model_size): ) eval( ctx=wrap_arguments( - f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++prompt_config=eval/aai/mcq-10choices-boxed " + f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++prompt_config=eval/aai/mcq-10choices-boxed ++parse_reasoning=True " ), cluster=cluster, expname=f"eval-mmlu-pro-{model_size}", @@ -132,7 +137,9 @@ def eval_science(model_size): # num_chunks=10, # parallelize 10x for faster eval on slurm ) eval( - ctx=wrap_arguments(f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 "), + ctx=wrap_arguments( + f"++inference.tokens_to_generate={eval_tokens} ++inference.temperature=0.6 ++parse_reasoning=True " + ), cluster=cluster, expname=f"eval-hle-{model_size}", output_dir=f"{output_dir}/{model_size}", diff --git a/tests/data/eval_outputs/eval-results/math/output-rs0.jsonl-test b/tests/data/eval_outputs/eval-results/hendrycks_math/output-rs0.jsonl-test similarity index 100% rename from tests/data/eval_outputs/eval-results/math/output-rs0.jsonl-test rename to tests/data/eval_outputs/eval-results/hendrycks_math/output-rs0.jsonl-test diff --git a/tests/data/eval_outputs/eval-results/math/output-rs1.jsonl-test b/tests/data/eval_outputs/eval-results/hendrycks_math/output-rs1.jsonl-test similarity index 100% rename from tests/data/eval_outputs/eval-results/math/output-rs1.jsonl-test rename to tests/data/eval_outputs/eval-results/hendrycks_math/output-rs1.jsonl-test diff --git 
a/tests/data/eval_outputs/eval-results/math/output-rs2.jsonl-test b/tests/data/eval_outputs/eval-results/hendrycks_math/output-rs2.jsonl-test similarity index 100% rename from tests/data/eval_outputs/eval-results/math/output-rs2.jsonl-test rename to tests/data/eval_outputs/eval-results/hendrycks_math/output-rs2.jsonl-test diff --git a/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test b/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test index 651da42b1d..2b089de9d6 100644 --- a/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test +++ b/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test @@ -293,156 +293,7 @@ "no_answer": 0.0 } }, - "human-eval": { - "pass@1": { - "num_entries": 4, - "avg_tokens": 1792, - "passing_base_tests": 75.0, - "passing_plus_tests": 62.5 - }, - "pass@2": { - "num_entries": 4, - "avg_tokens": 1792, - "passing_base_tests": 75.0, - "passing_plus_tests": 75.0 - }, - "pass@1[avg-of-2]": { - "num_entries": 4, - "avg_tokens": 1792, - "passing_base_tests": 75.0, - "passing_plus_tests": 62.5, - "reasoning_tokens_statistics": { - "avg": 0.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 1792.5, - "std_dev_across_runs": 1157.8873541929715, - "avg_sample_std_dev": 1685.389012958136, - "std_err_across_runs": 818.7499999999999 - }, - "passing_base_tests_statistics": { - "avg": 0.75, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "passing_plus_tests_statistics": { - "avg": 0.625, - "std_dev_across_runs": 0.1767766952966369, - "avg_sample_std_dev": 0.1767766952966369, - "std_err_across_runs": 0.125 - } - } - }, - "ifeval": { - "pass@1": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 
60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@2": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@1[avg-of-2]": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597, - "reasoning_tokens_statistics": { - "avg": 0.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 2587.3333333333335, - "std_dev_across_runs": 532.6871084938659, - "avg_sample_std_dev": 1072.445284799597, - "std_err_across_runs": 376.66666666666674 - }, - "prompt_statistics": { - "avg": 0.3333333333333333, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "instruction_statistics": { - "avg": 1.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - } - }, - "pass@3": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@1[avg-of-3]": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597, - "reasoning_tokens_statistics": { - "avg": 0.0, - 
"std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 2597.6666666666665, - "std_dev_across_runs": 377.091648158788, - "avg_sample_std_dev": 950.0187851733223, - "std_err_across_runs": 217.71396457363593 - }, - "prompt_statistics": { - "avg": 0.3333333333333333, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "instruction_statistics": { - "avg": 1.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - } - } - }, - "math": { + "hendrycks_math": { "pass@1": { "num_entries": 7, "avg_tokens": 9574, @@ -571,7 +422,7 @@ "no_answer": 42.857142857142854 } }, - "math-aime25": { + "hendrycks_math-aime25": { "pass@1": { "num_entries": 5, "avg_tokens": 9869, @@ -700,7 +551,7 @@ "no_answer": 40.0 } }, - "math-aime24": { + "hendrycks_math-aime24": { "pass@1": { "num_entries": 2, "avg_tokens": 8838, @@ -829,6 +680,155 @@ "no_answer": 50.0 } }, + "human-eval": { + "pass@1": { + "num_entries": 4, + "avg_tokens": 1792, + "passing_base_tests": 75.0, + "passing_plus_tests": 62.5 + }, + "pass@2": { + "num_entries": 4, + "avg_tokens": 1792, + "passing_base_tests": 75.0, + "passing_plus_tests": 75.0 + }, + "pass@1[avg-of-2]": { + "num_entries": 4, + "avg_tokens": 1792, + "passing_base_tests": 75.0, + "passing_plus_tests": 62.5, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 1792.5, + "std_dev_across_runs": 1157.8873541929715, + "avg_sample_std_dev": 1685.389012958136, + "std_err_across_runs": 818.7499999999999 + }, + "passing_base_tests_statistics": { + "avg": 0.75, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "passing_plus_tests_statistics": { + "avg": 0.625, + "std_dev_across_runs": 0.1767766952966369, + "avg_sample_std_dev": 
0.1767766952966369, + "std_err_across_runs": 0.125 + } + } + }, + "ifeval": { + "pass@1": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@2": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@1[avg-of-2]": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 2587.3333333333335, + "std_dev_across_runs": 532.6871084938659, + "avg_sample_std_dev": 1072.445284799597, + "std_err_across_runs": 376.66666666666674 + }, + "prompt_statistics": { + "avg": 0.3333333333333333, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "instruction_statistics": { + "avg": 1.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + } + }, + "pass@3": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@1[avg-of-3]": { + 
"num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 2597.6666666666665, + "std_dev_across_runs": 377.091648158788, + "avg_sample_std_dev": 950.0187851733223, + "std_err_across_runs": 217.71396457363593 + }, + "prompt_statistics": { + "avg": 0.3333333333333333, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "instruction_statistics": { + "avg": 1.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + } + } + }, "minif2f": { "pass@1": { "num_entries": 3, @@ -927,4 +927,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/data/eval_outputs/eval-results/metrics.json-test b/tests/data/eval_outputs/eval-results/metrics.json-test index de0aaf043d..e870a6911f 100644 --- a/tests/data/eval_outputs/eval-results/metrics.json-test +++ b/tests/data/eval_outputs/eval-results/metrics.json-test @@ -293,156 +293,7 @@ "no_answer": 0.0 } }, - "human-eval": { - "pass@1": { - "num_entries": 4, - "avg_tokens": 3840, - "passing_base_tests": 75.0, - "passing_plus_tests": 62.5 - }, - "pass@2": { - "num_entries": 4, - "avg_tokens": 3840, - "passing_base_tests": 75.0, - "passing_plus_tests": 75.0 - }, - "pass@1[avg-of-2]": { - "num_entries": 4, - "avg_tokens": 3840, - "passing_base_tests": 75.0, - "passing_plus_tests": 62.5, - "reasoning_tokens_statistics": { - "avg": 0.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 3840.5, - "std_dev_across_runs": 1738.422021547127, - "avg_sample_std_dev": 
1758.9281182015368, - "std_err_across_runs": 1229.25 - }, - "passing_base_tests_statistics": { - "avg": 0.75, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "passing_plus_tests_statistics": { - "avg": 0.625, - "std_dev_across_runs": 0.1767766952966369, - "avg_sample_std_dev": 0.1767766952966369, - "std_err_across_runs": 0.125 - } - } - }, - "ifeval": { - "pass@1": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@2": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@1[avg-of-2]": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597, - "reasoning_tokens_statistics": { - "avg": 0.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 2587.3333333333335, - "std_dev_across_runs": 532.6871084938659, - "avg_sample_std_dev": 1072.445284799597, - "std_err_across_runs": 376.66666666666674 - }, - "prompt_statistics": { - "avg": 0.3333333333333333, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "instruction_statistics": { - "avg": 1.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - } - }, - "pass@3": { - 
"num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597 - }, - "pass@1[avg-of-3]": { - "num_prompts": 3, - "num_instructions": 5, - "average_score": 46.666666666666664, - "prompt_strict_accuracy": 33.33333333333333, - "instruction_strict_accuracy": 60.0, - "prompt_loose_accuracy": 33.33333333333333, - "instruction_loose_accuracy": 60.0, - "num_entries": 3, - "avg_tokens": 2597, - "reasoning_tokens_statistics": { - "avg": 0.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "answer_tokens_statistics": { - "avg": 2597.6666666666665, - "std_dev_across_runs": 377.091648158788, - "avg_sample_std_dev": 950.0187851733223, - "std_err_across_runs": 217.71396457363593 - }, - "prompt_statistics": { - "avg": 0.3333333333333333, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - }, - "instruction_statistics": { - "avg": 1.0, - "std_dev_across_runs": 0.0, - "avg_sample_std_dev": 0.0, - "std_err_across_runs": 0.0 - } - } - }, - "math": { + "hendrycks_math": { "pass@1": { "num_entries": 7, "avg_tokens": 9574, @@ -571,7 +422,7 @@ "no_answer": 0.0 } }, - "math-aime25": { + "hendrycks_math-aime25": { "pass@1": { "num_entries": 5, "avg_tokens": 9869, @@ -700,7 +551,7 @@ "no_answer": 0.0 } }, - "math-aime24": { + "hendrycks_math-aime24": { "pass@1": { "num_entries": 2, "avg_tokens": 8838, @@ -829,6 +680,155 @@ "no_answer": 0.0 } }, + "human-eval": { + "pass@1": { + "num_entries": 4, + "avg_tokens": 3840, + "passing_base_tests": 75.0, + "passing_plus_tests": 62.5 + }, + "pass@2": { + "num_entries": 4, + "avg_tokens": 3840, + "passing_base_tests": 75.0, + "passing_plus_tests": 75.0 + }, + "pass@1[avg-of-2]": { + "num_entries": 4, + "avg_tokens": 3840, + 
"passing_base_tests": 75.0, + "passing_plus_tests": 62.5, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 3840.5, + "std_dev_across_runs": 1738.422021547127, + "avg_sample_std_dev": 1758.9281182015368, + "std_err_across_runs": 1229.25 + }, + "passing_base_tests_statistics": { + "avg": 0.75, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "passing_plus_tests_statistics": { + "avg": 0.625, + "std_dev_across_runs": 0.1767766952966369, + "avg_sample_std_dev": 0.1767766952966369, + "std_err_across_runs": 0.125 + } + } + }, + "ifeval": { + "pass@1": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@2": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@1[avg-of-2]": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 2587.3333333333335, + "std_dev_across_runs": 532.6871084938659, + "avg_sample_std_dev": 1072.445284799597, + "std_err_across_runs": 
376.66666666666674 + }, + "prompt_statistics": { + "avg": 0.3333333333333333, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "instruction_statistics": { + "avg": 1.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + } + }, + "pass@3": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597 + }, + "pass@1[avg-of-3]": { + "num_prompts": 3, + "num_instructions": 5, + "average_score": 46.666666666666664, + "prompt_strict_accuracy": 33.33333333333333, + "instruction_strict_accuracy": 60.0, + "prompt_loose_accuracy": 33.33333333333333, + "instruction_loose_accuracy": 60.0, + "num_entries": 3, + "avg_tokens": 2597, + "reasoning_tokens_statistics": { + "avg": 0.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "answer_tokens_statistics": { + "avg": 2597.6666666666665, + "std_dev_across_runs": 377.091648158788, + "avg_sample_std_dev": 950.0187851733223, + "std_err_across_runs": 217.71396457363593 + }, + "prompt_statistics": { + "avg": 0.3333333333333333, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + }, + "instruction_statistics": { + "avg": 1.0, + "std_dev_across_runs": 0.0, + "avg_sample_std_dev": 0.0, + "std_err_across_runs": 0.0 + } + } + }, "minif2f": { "pass@1": { "num_entries": 3, @@ -927,4 +927,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/data/eval_outputs/summarize_results_output-ms8192.txt b/tests/data/eval_outputs/summarize_results_output-ms8192.txt index 7dc53810e2..4035bc4193 100644 --- a/tests/data/eval_outputs/summarize_results_output-ms8192.txt +++ b/tests/data/eval_outputs/summarize_results_output-ms8192.txt @@ -18,19 +18,7 
@@ majority@4 | 4 | 5363 | 50.00% | 0.00% pass@4 | 4 | 5363 | 50.00% | 0.00% -------------------------------------- human-eval ------------------------------------ -evaluation_mode | num_entries | avg_tokens | passing_base_tests | passing_plus_tests -pass@1[avg-of-2] | 4 | 1792 | 75.00% ± 0.00% | 62.50% ± 17.68% -pass@2 | 4 | 1792 | 75.00% | 75.00% - - ------------------------------------------------------------------------------------------------- ifeval ------------------------------------------------------------------------------------------------ -evaluation_mode | num_prompts | num_instructions | average_score | prompt_strict_accuracy | instruction_strict_accuracy | prompt_loose_accuracy | instruction_loose_accuracy | num_entries | avg_tokens -pass@1[avg-of-3] | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 -pass@3 | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 - - ------------------------------------------ math ----------------------------------------- +------------------------------------ hendrycks_math ------------------------------------ evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 7 | 9574 | 1577595118 | 42.86% ± 14.29% | 52.38% majority@3 | 7 | 9574 | 1577595118 | 57.14% | 42.86% @@ -39,7 +27,7 @@ rm_majority@3 | 7 | 9574 | 1577595118 | 57.14% | 4 pass@3 | 7 | 9574 | 1577595118 | 57.14% | 42.86% -------------------------------------- math-aime25 -------------------------------------- +-------------------------------- hendrycks_math-aime25 --------------------------------- evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 5 | 9869 | 60037 | 46.67% ± 11.55% | 53.33% majority@3 | 5 | 9869 | 60037 | 60.00% | 40.00% @@ -48,7 +36,7 @@ rm_majority@3 | 5 | 9869 | 60037 | 60.00% | 4 pass@3 | 5 | 9869 | 60037 | 60.00% | 40.00% -------------------------------------- math-aime24 -------------------------------------- 
+-------------------------------- hendrycks_math-aime24 --------------------------------- evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 2 | 8838 | 1577595118 | 33.33% ± 28.87% | 50.00% majority@3 | 2 | 8838 | 1577595118 | 50.00% | 50.00% @@ -57,6 +45,18 @@ rm_majority@3 | 2 | 8838 | 1577595118 | 50.00% | 5 pass@3 | 2 | 8838 | 1577595118 | 50.00% | 50.00% +------------------------------------- human-eval ------------------------------------ +evaluation_mode | num_entries | avg_tokens | passing_base_tests | passing_plus_tests +pass@1[avg-of-2] | 4 | 1792 | 75.00% ± 0.00% | 62.50% ± 17.68% +pass@2 | 4 | 1792 | 75.00% | 75.00% + + +------------------------------------------------------------------------------------------------ ifeval ------------------------------------------------------------------------------------------------ +evaluation_mode | num_prompts | num_instructions | average_score | prompt_strict_accuracy | instruction_strict_accuracy | prompt_loose_accuracy | instruction_loose_accuracy | num_entries | avg_tokens +pass@1[avg-of-3] | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 +pass@3 | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 + + ---------------------------------- minif2f ---------------------------------- evaluation_mode | num_entries | avg_tokens | lean4_correct | timeout_error pass@1[avg-of-4] | 3 | 2455 | 33.33% ± 27.22% | 66.67% diff --git a/tests/data/eval_outputs/summarize_results_output.txt b/tests/data/eval_outputs/summarize_results_output.txt index 7a0b30ddc5..b85e7b4ef8 100644 --- a/tests/data/eval_outputs/summarize_results_output.txt +++ b/tests/data/eval_outputs/summarize_results_output.txt @@ -17,19 +17,7 @@ majority@4 | 4 | 5363 | 50.00% | 0.00% pass@4 | 4 | 5363 | 75.00% | 0.00% -------------------------------------- human-eval ------------------------------------ -evaluation_mode | num_entries | avg_tokens | passing_base_tests | 
passing_plus_tests -pass@1[avg-of-2] | 4 | 3840 | 75.00% ± 0.00% | 62.50% ± 17.68% -pass@2 | 4 | 3840 | 75.00% | 75.00% - - ------------------------------------------------------------------------------------------------- ifeval ------------------------------------------------------------------------------------------------ -evaluation_mode | num_prompts | num_instructions | average_score | prompt_strict_accuracy | instruction_strict_accuracy | prompt_loose_accuracy | instruction_loose_accuracy | num_entries | avg_tokens -pass@1[avg-of-3] | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 -pass@3 | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 - - ------------------------------------------ math ----------------------------------------- +------------------------------------ hendrycks_math ------------------------------------ evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 7 | 9574 | 1577595118 | 71.43% ± 14.29% | 0.00% majority@3 | 7 | 9574 | 1577595118 | 76.19% | 0.00% @@ -38,7 +26,7 @@ rm_majority@3 | 7 | 9574 | 1577595118 | 71.43% | 0 pass@3 | 7 | 9574 | 1577595118 | 85.71% | 0.00% -------------------------------------- math-aime25 -------------------------------------- +-------------------------------- hendrycks_math-aime25 --------------------------------- evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 5 | 9869 | 60037 | 80.00% ± 0.00% | 0.00% majority@3 | 5 | 9869 | 60037 | 80.00% | 0.00% @@ -47,7 +35,7 @@ rm_majority@3 | 5 | 9869 | 60037 | 80.00% | 0 pass@3 | 5 | 9869 | 60037 | 80.00% | 0.00% -------------------------------------- math-aime24 -------------------------------------- +-------------------------------- hendrycks_math-aime24 --------------------------------- evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer pass@1[avg-of-3] | 2 | 8838 | 1577595118 | 50.00% ± 50.00% 
| 0.00% majority@3 | 2 | 8838 | 1577595118 | 66.67% | 0.00% @@ -56,6 +44,18 @@ rm_majority@3 | 2 | 8838 | 1577595118 | 50.00% | 0 pass@3 | 2 | 8838 | 1577595118 | 100.00% | 0.00% +------------------------------------- human-eval ------------------------------------ +evaluation_mode | num_entries | avg_tokens | passing_base_tests | passing_plus_tests +pass@1[avg-of-2] | 4 | 3840 | 75.00% ± 0.00% | 62.50% ± 17.68% +pass@2 | 4 | 3840 | 75.00% | 75.00% + + +------------------------------------------------------------------------------------------------ ifeval ------------------------------------------------------------------------------------------------ +evaluation_mode | num_prompts | num_instructions | average_score | prompt_strict_accuracy | instruction_strict_accuracy | prompt_loose_accuracy | instruction_loose_accuracy | num_entries | avg_tokens +pass@1[avg-of-3] | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 +pass@3 | 3 | 5 | 46.67% | 33.33% | 60.00% | 33.33% | 60.00% | 3 | 2597 + + ---------------------------------- minif2f ---------------------------------- evaluation_mode | num_entries | avg_tokens | lean4_correct | timeout_error pass@1[avg-of-4] | 3 | 2455 | 33.33% ± 27.22% | 66.67% diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 5b270c8684..462536ea8d 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -14,11 +14,13 @@ import json import subprocess +from importlib import import_module from pathlib import Path import pytest from utils import require_env_var +from nemo_skills.pipeline.cli import eval, prepare_data, run_cmd, wrap_arguments from tests.conftest import docker_rm @@ -135,6 +137,7 @@ def test_hf_eval(server_type, server_args): f" --server_args='{server_args}' " f" ++max_samples=164 " f" ++inference.tokens_to_generate=2048 " + f" ++parse_reasoning=True " ) subprocess.run(cmd, shell=True, check=True) @@ -209,3 +212,98 @@ def test_megatron_eval(): # TODO: something is broken 
in megatron inference here as this should be 50! assert metrics["symbolic_correct"] >= 40 assert metrics["num_entries"] == 5 + + +@pytest.mark.gpu +def test_prepare_and_eval_all_datasets(): + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + config_dir = Path(__file__).absolute().parent + datasets_dir = Path(__file__).absolute().parents[2] / "nemo_skills" / "dataset" + # not testing datasets that don't support max_samples, require explicit parameters or are very heavy to prepare + excluded_datasets = { + "__pycache__", + "ruler", + "bigcodebench", + "livecodebench", + "livebench_coding", + "livecodebench-pro", + "livecodebench-cpp", + "ioi24", + "ioi25", + "bfcl_v3", + "swe-bench", + "aai", + "human-eval", + "human-eval-infilling", + "mbpp", + } + + dataset_names = sorted( + dataset.name + for dataset in datasets_dir.iterdir() + if dataset.is_dir() and (dataset / "prepare.py").exists() and dataset.name not in excluded_datasets + ) + + assert dataset_names, "No datasets found to prepare and evaluate" + + judge_datasets = [] + for dataset in dataset_names: + dataset_module = import_module(f"nemo_skills.dataset.{dataset}") + if getattr(dataset_module, "JUDGE_PIPELINE_ARGS", None): + judge_datasets.append(dataset) + + non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets] + + data_dir = Path(f"/tmp/nemo-skills-tests/{model_type}/data") + docker_rm([str(data_dir)]) + + prepare_data( + ctx=wrap_arguments(" ".join(dataset_names)), + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + expname=f"prepare-all-datasets-{model_type}", + ) + + eval_kwargs = dict( + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + model=model_path, + server_type="sglang", + server_gpus=1, + server_nodes=1, + auto_summarize_results=False, + ) + + common_ctx = "++max_samples=2 ++inference.tokens_to_generate=100 
++server.enable_soft_fail=True " + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/all-datasets-eval" + docker_rm([output_dir]) + eval( + ctx=wrap_arguments(common_ctx), + output_dir=output_dir, + benchmarks=",".join(non_judge_datasets), + expname=f"eval-all-datasets-{model_type}", + **eval_kwargs, + ) + + run_cmd( + ctx=wrap_arguments(f"python -m nemo_skills.pipeline.summarize_results {output_dir}"), + cluster="test-local", + config_dir=str(config_dir), + ) + + eval_results_dir = Path(output_dir) / "eval-results" + metrics_path = eval_results_dir / "metrics.json" + assert metrics_path.exists(), "Missing aggregated metrics file" + with metrics_path.open() as f: + metrics = json.load(f) + + for dataset in non_judge_datasets: + assert dataset in metrics, f"Missing metrics for {dataset}" + + # TODO: add same for judge_datasets after generate supports num_jobs + # (otherwise it starts judge every time and takes forever) diff --git a/tests/gpu-tests/test_generate.py b/tests/gpu-tests/test_generate.py index 87352e679a..d286485822 100644 --- a/tests/gpu-tests/test_generate.py +++ b/tests/gpu-tests/test_generate.py @@ -116,9 +116,9 @@ def test_vllm_generate_seeds(): f" --server_nodes 1 " f" --server_args '--enforce-eager' " f" --num_random_seeds {num_seeds} " - f" --eval_args='++eval_type=math' " f" --with_sandbox " f" --input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/test.jsonl " + f" ++eval_type=math " f" ++prompt_config=generic/math " f" ++max_samples=10 " f" ++skip_filled=False " diff --git a/tests/gpu-tests/test_judge.py b/tests/gpu-tests/test_judge.py index 1c6669d176..7b8620ce07 100644 --- a/tests/gpu-tests/test_judge.py +++ b/tests/gpu-tests/test_judge.py @@ -29,7 +29,7 @@ def test_trtllm_judge(): model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") input_dir = "/nemo_run/code/tests/data" - output_dir = f"/tmp/nemo-skills-tests/{model_type}/judge/math" + output_dir = f"/tmp/nemo-skills-tests/{model_type}/judge/math-500" docker_rm([output_dir]) diff 
--git a/tests/slurm-tests/qwen3_4b_evals/run_test.py b/tests/slurm-tests/qwen3_4b_evals/run_test.py index cbcfa46d59..7877a2a238 100644 --- a/tests/slurm-tests/qwen3_4b_evals/run_test.py +++ b/tests/slurm-tests/qwen3_4b_evals/run_test.py @@ -26,6 +26,7 @@ def eval_qwen3_bfcl(workspace, cluster, expname_prefix, wandb_project): f"++inference.top_p=0.95 " f"++inference.tokens_to_generate=8192 " f"++model_name={model} " + f"++parse_reasoning=True " ), cluster=cluster, benchmarks="bfcl_v3", diff --git a/tests/slurm-tests/run_all.sh b/tests/slurm-tests/run_all.sh index 7f082f90df..200cf5e3fc 100755 --- a/tests/slurm-tests/run_all.sh +++ b/tests/slurm-tests/run_all.sh @@ -1,16 +1,17 @@ #!/bin/bash CLUSTER=$1 +RUN_NAME=${2:-$(date +%Y-%m-%d)} -CURRENT_DATE=$(date +%Y-%m-%d) +# TODO: change back to parallel submission after fixing https://github.com/NVIDIA-NeMo/Skills/issues/964 -python tests/slurm-tests/gpt_oss_python_aime25/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/gpt_oss_python_aime25 --expname_prefix gpt_oss_python_aime25_$CURRENT_DATE & -sleep 10 -python tests/slurm-tests/super_49b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/super_49b_evals --expname_prefix super_49b_evals_$CURRENT_DATE & -sleep 10 -python tests/slurm-tests/qwen3_4b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3_4b_evals --expname_prefix qwen3_4b_evals_$CURRENT_DATE & -sleep 10 -python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-rl --expname_prefix omr_simple_recipe_nemo_rl_$CURRENT_DATE & -sleep 10 -python tests/slurm-tests/qwen3coder_30b_swebench/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3coder_30b_swebench --expname_prefix qwen3coder_30b_swebench_$CURRENT_DATE --container_formatter 
'/swe-bench-images/swebench_sweb.eval.x86_64.{instance_id}.sif' & -wait +python tests/slurm-tests/gpt_oss_python_aime25/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$RUN_NAME/gpt_oss_python_aime25 --expname_prefix gpt_oss_python_aime25_$RUN_NAME +# sleep 10 +python tests/slurm-tests/super_49b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$RUN_NAME/super_49b_evals --expname_prefix super_49b_evals_$RUN_NAME +# sleep 10 +python tests/slurm-tests/qwen3_4b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$RUN_NAME/qwen3_4b_evals --expname_prefix qwen3_4b_evals_$RUN_NAME +# sleep 10 +python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$RUN_NAME/omr_simple_recipe/nemo-rl --expname_prefix omr_simple_recipe_nemo_rl_$RUN_NAME +# sleep 10 +python tests/slurm-tests/qwen3coder_30b_swebench/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$RUN_NAME/qwen3coder_30b_swebench --expname_prefix qwen3coder_30b_swebench_$RUN_NAME --container_formatter '/swe-bench-images/swebench_sweb.eval.x86_64.{instance_id}.sif' +# wait diff --git a/tests/slurm-tests/super_49b_evals/run_test.py b/tests/slurm-tests/super_49b_evals/run_test.py index b28bdad19b..d31d62a21d 100644 --- a/tests/slurm-tests/super_49b_evals/run_test.py +++ b/tests/slurm-tests/super_49b_evals/run_test.py @@ -50,7 +50,7 @@ def eval_reasoning_on(workspace, cluster, expname_prefix, wandb_project): base_model = f"{workspace}/Llama-3_3-Nemotron-Super-49B-v1_5" # Common settings for reasoning ON - common_params = "++inference.temperature=0.6 ++inference.top_p=0.95 " + common_params = "++inference.temperature=0.6 ++inference.top_p=0.95 ++parse_reasoning=True" tokens_to_generate = "++inference.tokens_to_generate=65536 " # Math / Code / Science (Reasoning ON) eval( diff --git a/tests/test_configs.py b/tests/test_configs.py index 
ff4c96c15c..9494a2f369 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -51,20 +51,6 @@ def test_error_on_extra_params(): except subprocess.CalledProcessError as e: assert "got an unexpected keyword argument 'num_few_shots'" in e.stderr.decode() - # sandbox.sandbox_host is not supported - cmd = ( - "python nemo_skills/evaluation/evaluate_results.py " - " ++input_files=./test-results/gsm8k/output.jsonl " - " ++eval_type=math " - " ++eval_config.sandbox.sandbox_type=local " - " ++eval_config.sandbox.sandbox_host=123 " - " ++remove_thinking=false " - ) - try: - subprocess.run(cmd, shell=True, check=True, capture_output=True) - except subprocess.CalledProcessError as e: - assert "got an unexpected keyword argument 'sandbox'" in e.stderr.decode() - @pytest.mark.parametrize( "mount_source, mount_dest, input_path, expected", diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a27947a45b..cbd6c56207 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -41,7 +41,7 @@ ], ), ("ifeval", ["test"]), - ("math", ["train", "test"]), + ("hendrycks_math", ["train", "test"]), ("math-odyssey", ["test"]), ("mawps", ["test"]), ("mbpp", ["test"]),