diff --git a/docs/evaluation/multilingual.md b/docs/evaluation/multilingual.md index aebc90ab2b..399e62291d 100644 --- a/docs/evaluation/multilingual.md +++ b/docs/evaluation/multilingual.md @@ -2,7 +2,7 @@ Our multilingual benchmarks cover things like multilingual reasoning as well as machine translation. -All benchmarks in this category will have an extra `--language` argument with its associated `ns prepare` command, which allows you to choose which language(s) of the benchmark to run. +All benchmarks in this category will have an extra `--languages` argument with its associated `ns prepare` command, which allows you to choose which language(s) of the benchmark to run. Once prepared, the `ns eval` command will run on all languages prepared, and the summarized results generated with `ns eval` will include per-language breakdowns. ## Supported benchmarks @@ -96,7 +96,7 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu --server_type=vllm \ --server_gpus=8 \ --split=devtest \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++system_message='/no_think' ``` @@ -111,7 +111,7 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu --server_type=vllm \ --server_gpus=8 \ --split=devtest \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++prompt_suffix='/no_think' ``` @@ -126,7 +126,7 @@ Some reference numbers for devtest split (xx corresponds to average over 5 langu --server_type=vllm \ --server_gpus=8 \ --split=devtest \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++prompt_suffix='/no_think' ``` @@ -169,7 +169,7 @@ Some reference numbers for test split (xx corresponds to average over 5 language --server_type=vllm \ --server_gpus=8 \ --split=test \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++system_message='/no_think' ``` @@ -184,7 +184,7 @@ Some reference numbers for test split (xx 
corresponds to average over 5 language --server_type=vllm \ --server_gpus=8 \ --split=test \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++prompt_suffix='/no_think' ``` @@ -199,7 +199,7 @@ Some reference numbers for test split (xx corresponds to average over 5 language --server_type=vllm \ --server_gpus=8 \ --split=test \ - ++inference.tokens_to_generate=512 + ++inference.tokens_to_generate=512 \ ++prompt_suffix='/no_think' ``` @@ -217,6 +217,151 @@ Some reference numbers for test split (xx corresponds to average over 5 language ++inference.tokens_to_generate=2048 ``` +### mmmlu + +- Benchmark is defined in [`nemo_skills/dataset/mmmlu/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/mmmlu/__init__.py) +- Original benchmark source is [here](https://huggingface.co/datasets/openai/MMMLU). + +MMMLU is a multilingual extension of the MMLU benchmark that covers 14 languages: Arabic, Bengali, German, Spanish, French, Hindi, Indonesian, Italian, Japanese, Korean, Portuguese, Chinese, Swahili, and Yoruba. The `--include_english` flag can be used to additionally include the English split (original MMLU dataset). + +```bash +ns prepare_data mmmlu --languages ... 
--include_english +``` +Some reference numbers for reference and commands for reproduction: + + +| Model | Avg (14 langs) | AR-XY | DE-DE | ES-LA | FR-FR | HI-IN | IT-IT | JA-JP | KO-KR | PT-BR | ZH-CN | BN-BD | ID-ID | SW-KE | YO-NG | +| :-----------------------------: | :------------: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| gpt-oss-120b | **82.66** | 83.58 | 84.18 | 86.53 | 86.08 | 83.67 | 85.91 | 84.98 | 83.95 | 86.03 | 85.11 | 81.87 | 85.04 | 75.04 | 65.20 | +| Qwen3.5-122B-A10B | **87.57** | 88.62 | 89.08 | 90.10 | 89.68 | 88.11 | 89.69 | 89.27 | 88.51 | 90.09 | 89.39 | 86.56 | 89.04 | 83.65 | 74.13 | +| Nemotron-3-Super-120B-A12B-BF16 | **81.51** | 86.68 | 84.59 | 88.59 | 88.04 | 86.21 | 88.06 | 86.83 | 86.23 | 88.35 | 87.12 | 80.88 | 86.84 | 71.31 | 31.43 | + +=== "gpt-oss-120b" + + ```bash + ns eval \ + --cluster=[cluster] \ + --model=openai/gpt-oss-120b \ + --benchmarks mmmlu \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + ++inference.tokens_to_generate=120000 \ + ++inference.temperature=1.0 \ + ++inference.top_p=1.0 \ + ++inference.reasoning_effort=high + ``` + +=== "Qwen3.5-122B-A10B" + + ```bash + ns eval \ + --cluster=[cluster] \ + --model=Qwen/Qwen3.5-122B-A10B \ + --benchmarks mmmlu \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + --server_args='--max-model-len 262144 --reasoning-parser qwen3 --language-model-only' \ + ++chat_template_kwargs.enable_thinking=true \ + ++inference.tokens_to_generate=81920 \ + ++inference.temperature=1.0 \ + ++inference.top_p=0.95 \ + ++inference.top_k=20 \ + ++inference.repetition_penalty=1.0 + ``` + +=== "Nemotron-3-Super-120B-A12B-BF16" + + ```bash + ns eval \ + --cluster=[cluster] \ + --model=NVIDIA/Nemotron-3-Super-120B-A12B-BF16 \ + --benchmarks mmmlu \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + --server_args='--mamba_ssm_cache_dtype float32' \ + 
++chat_template_kwargs.enable_thinking=true \ + ++parse_reasoning=true \ + ++inference.tokens_to_generate=131072 \ + ++inference.temperature=1.0 \ + ++inference.top_p=0.95 + ``` + + +### Global PIQA + +- Benchmark is defined in [`nemo_skills/dataset/global_piqa/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/global_piqa/__init__.py) +- Original benchmark source is [here](https://huggingface.co/datasets/mrlbenchmarks/global-piqa-nonparallel). + +Global PIQA is a multilingual physical intuition question-answering benchmark focused on commonsense reasoning. Each question presents a situation with two solution options (A/B). It supports 116 languages. + +```bash +ns prepare_data global_piqa --languages ... +``` +Some reference numbers for reference and commands for reproduction: + +| Model | Avg (116 langs) | +| :-----------------------------: | :-------------: | +| gpt-oss-120b | **84.61** | +| Qwen3.5-122B-A10B | **88.72** | +| Nemotron-3-Super-120B-A12B-BF16 | **82.28** | + +=== "gpt-oss-120b" + + ```bash + ns eval \ + --cluster=[cluster] \ + --model=openai/gpt-oss-120b \ + --benchmarks global_piqa \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + ++inference.tokens_to_generate=120000 \ + ++inference.temperature=1.0 \ + ++inference.top_p=1.0 \ + ++inference.reasoning_effort=high + ``` + +=== "Qwen3.5-122B-A10B" + + ```bash + ns eval \ + --cluster=[cluster] \ + --model=Qwen/Qwen3.5-122B-A10B \ + --benchmarks global_piqa \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + --server_args='--max-model-len 262144 --reasoning-parser qwen3 --language-model-only' \ + ++chat_template_kwargs.enable_thinking=true \ + ++inference.tokens_to_generate=81920 \ + ++inference.temperature=1.0 \ + ++inference.top_p=0.95 \ + ++inference.top_k=20 \ + ++inference.repetition_penalty=1.0 + ``` + +=== "Nemotron-3-Super-120B-A12B-BF16" + + ```bash + ns eval \ + --cluster=[cluster] \ + 
--model=NVIDIA/Nemotron-3-Super-120B-A12B-BF16 \ + --benchmarks global_piqa \ + --output_dir=[output dir] \ + --server_type=vllm \ + --server_gpus=8 \ + --server_args='--mamba_ssm_cache_dtype float32' \ + ++chat_template_kwargs.enable_thinking=true \ + ++parse_reasoning=true \ + ++inference.tokens_to_generate=131072 \ + ++inference.temperature=1.0 \ + ++inference.top_p=0.95 + ``` + + ## Supported translation metrics By default, we compute [BLEU score](https://github.com/mjpost/sacrebleu) to evaluate machine translation. However, we also support COMET, a popular neural metric for machine translation. Computing COMET requires a separate evaluation run that uses [xCOMET-XXL](https://huggingface.co/Unbabel/XCOMET-XXL) model as a judge. This run can be scheduled by adding the following parameters to the evaluation command: diff --git a/nemo_skills/dataset/global_piqa/__init__.py b/nemo_skills/dataset/global_piqa/__init__.py new file mode 100644 index 0000000000..7eb573db87 --- /dev/null +++ b/nemo_skills/dataset/global_piqa/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# nemo_skills/dataset/global_piqa/__init__.py — benchmark registration constants.
METRICS_TYPE = "multichoice"
GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=multichoice"


# nemo_skills/dataset/global_piqa/global_piqa_utils.py — HF dataset access helpers.

# Single source of truth for the HF dataset id (was duplicated in two calls).
_DATASET_NAME = "mrlbenchmarks/global-piqa-nonparallel"


def supported_languages() -> list[str]:
    """Return all language configs available for the Global PIQA HF dataset."""
    # Imported lazily so merely importing this module does not require the
    # `datasets` package or network access to the HF hub.
    from datasets import get_dataset_config_names

    return get_dataset_config_names(_DATASET_NAME)


def load_global_piqa_datasets(languages: list[str], split: str = "test") -> dict[str, list[dict]]:
    """Load the requested language configs, keyed by language code.

    Args:
        languages: HF config names (language codes) to load.
        split: dataset split to take from each config (default "test").

    Returns:
        Mapping from language code to the loaded split (iterable of row dicts).
    """
    from datasets import load_dataset

    return {lang: load_dataset(_DATASET_NAME, lang)[split] for lang in languages}


def digit_to_letter(digit: int) -> str:
    """Map an option index to its answer letter: 0 -> "A", 1 -> "B", ..."""
    return chr(ord("A") + digit)


class Schema:
    # Column names of the Global PIQA dataset rows.
    PROMPT = "prompt"
    SOLUTION0 = "solution0"
    SOLUTION1 = "solution1"
    LABEL = "label"
# Prompt and regexes adapted from:
# https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/global_piqa/prompted/_template

QUERY_TEMPLATE = """
Given the following situation, which option is more likely to be correct?

Situation:
{prompt} ...

Option A: {solution0}

Option B: {solution1}

Your response should end with "The best answer is: [answer_letter]" where [answer_letter] is one of A or B.
""".strip()


# Primary extraction patterns: "the best/final answer ...", "Answer: X", \boxed{X}.
ANSWER_REGEX1 = r"(?i)[Tt]he (?:[Bb]est [Aa]nswer|[Ff]inal [Aa]nswer|[Aa]nswer)[^A-B]*([A-B])"
ANSWER_REGEX2 = r"(?i)[Aa]nswer\s*:[^A-B]*([A-B])"
ANSWER_REGEX3 = r"(?i)\\boxed\{([A-B])\}"

# Fallback: greedily match everything, then capture the last A/B in the response.
# This is not in lm-evaluation-harness and is where we diverge from the original benchmark.
LETTER_REGEX = r"\b\(?\s*([A-B])\s*\)?\.?\b"
GREEDY_REGEX = r"[\s\S]*" + LETTER_REGEX
EXTRACT_REGEX = [ANSWER_REGEX1, ANSWER_REGEX2, ANSWER_REGEX3, GREEDY_REGEX]


def get_mcq_fields(entry: dict) -> dict:
    """Build the multichoice prompt fields for one Global PIQA entry.

    Returns a dict with the formatted "question", the rendered "options"
    block, plus one key per answer letter ("A", "B") mapping to its option
    text.
    """
    options_dict = {}
    option_lines = []
    for index in range(2):
        letter = chr(ord("A") + index)
        option = entry[f"solution{index}"]
        options_dict[letter] = option
        option_lines.append(f"Option {letter}: {option}")
    return {
        "question": QUERY_TEMPLATE.format(**entry),
        "options": "\n".join(option_lines),
        **options_dict,
    }
import argparse
import json
from pathlib import Path

from nemo_skills.dataset.global_piqa.global_piqa_utils import (
    EXTRACT_REGEX,
    Schema,
    digit_to_letter,
    get_mcq_fields,
    load_global_piqa_datasets,
    supported_languages,
)


def format_entry(entry: dict, language: str) -> dict:
    """Convert one raw Global PIQA row into the nemo-skills eval schema.

    Adds the expected answer letter, the answer-extraction regex list, and a
    per-language subset tag on top of the multichoice prompt fields.
    """
    return {
        "expected_answer": digit_to_letter(entry[Schema.LABEL]),
        "extract_from_boxed": False,
        "extract_regex": EXTRACT_REGEX,
        # Drives the per-language metric breakdown in `ns eval` summaries.
        "subset_for_metrics": language,
        "relaxed": False,
        **get_mcq_fields(entry),
    }


def main(args):
    """Write test.jsonl (all requested languages, concatenated) next to this script.

    Raises:
        ValueError: if any requested language is not a supported config.
    """
    # Resolve the language list here, not as an argparse default: fetching the
    # supported-language list hits the HF hub, and the original code paid that
    # round trip at parser-definition time even when --languages was given
    # explicitly (and then a second time for validation). One call covers both.
    available = supported_languages()
    languages = args.languages if args.languages else available
    invalid = set(languages) - set(available)
    if invalid:
        raise ValueError(f"Unsupported languages: {invalid}")
    datasets = load_global_piqa_datasets(languages)

    data_dir = Path(__file__).absolute().parent
    output_file = data_dir / "test.jsonl"
    with open(output_file, "wt", encoding="utf-8") as fout:
        for language in datasets:
            for entry in datasets[language]:
                entry = format_entry(entry=entry, language=language)
                json.dump(entry, fout, ensure_ascii=False)
                fout.write("\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--languages",
        default=None,  # resolved to supported_languages() inside main()
        nargs="+",
        help="Languages to process. Defaults to all supported languages.",
    )
    args = parser.parse_args()
    main(args)