diff --git a/docs/evaluation/other-benchmarks.md b/docs/evaluation/other-benchmarks.md index d89d66c4c3..33ecc199d6 100644 --- a/docs/evaluation/other-benchmarks.md +++ b/docs/evaluation/other-benchmarks.md @@ -6,11 +6,126 @@ More details are coming soon! ### arena-hard -!!! note - For now we use v1 implementation of the arena hard! - - Benchmark is defined in [`nemo_skills/dataset/arena-hard/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/arena-hard/__init__.py) - Original benchmark source is [here](https://github.com/lmarena/arena-hard-auto). +- Uses `gpt-4.1` as the default judge model for evaluation. +- Uses `gpt-4-0314` model's responses as reference answers (baseline answers) for comparison. + +#### Data Preparation + +First, prepare the dataset by running the `ns prepare_data` command. + +```bash +ns prepare_data arena-hard +``` + +#### Running the Evaluation + +Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths. + +```bash +export OPENAI_API_KEY=<> +ns eval \ + --cluster=<cluster> \ + --model=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \ + --server_type=vllm \ + --server_gpus=8 \ + --benchmarks=arena-hard \ + --output_dir=<output dir> \ + ++parse_reasoning=True \ + ++inference.temperature=0.6 \ + ++inference.top_p=0.95 \ + ++inference.tokens_to_generate=32768 +``` + +#### Verifying Results + +After all jobs are complete, you can check the results in `<output dir>/eval-results/arena-hard/metrics.json`.
+ +``` +------------------------------------------- arena-hard ------------------------------------------- +evaluation_mode | num_entries | score | 95_CI | invalid_scores | avg_tokens | gen_seconds +pass@1 | 500 | 94.82% | (-0.67, 0.69) | 0 | 3878 | 230 +``` + +### arena-hard-v2 + +- Benchmark is defined in [`nemo_skills/dataset/arena-hard-v2/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/arena-hard-v2/__init__.py) +- Original benchmark source is [here](https://github.com/lmarena/arena-hard-auto). +- Uses `gpt-4.1` as the default judge model for evaluation. +- Uses `o3-mini-2025-01-31` model's responses as reference answers (baseline answers) for comparison. + +#### Data Preparation + +First, prepare the dataset by running the `ns prepare_data` command. + +```bash +ns prepare_data arena-hard-v2 +``` + +#### Running the Evaluation + +Once the data is prepared, you can run the evaluation. Here is an example command to reproduce Nemotron-3-Nano reported +scores. + +```bash +export OPENAI_API_KEY=<> +ns eval \ + --cluster=<cluster> \ + --model=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \ + --server_type=vllm \ + --server_gpus=8 \ + --benchmarks=arena-hard-v2 \ + --output_dir=<output dir> \ + ++parse_reasoning=True \ + ++inference.temperature=1.0 \ + ++inference.top_p=1.0 \ + ++inference.tokens_to_generate=65536 \ + --extra_judge_args=" \ + ++inference.tokens_to_generate=16000 \ + ++inference.temperature=0 \ + ++max_concurrent_requests=16" +``` + +#### Verifying Results + +After all jobs are complete, you can check the results in `<output dir>/eval-results/arena-hard-v2/metrics.json`.
+ +```json +{ + "arena-hard-v2": { + "pass@1": { + "num_entries": 750, + "score": 70.25, + "95_CI": [ + -1.58, + 1.15 + ], + "invalid_scores": 1, + "avg_tokens": 4197, + "gen_seconds": 1371, + "category_hard_prompt": { + "num_entries": 500, + "score": 74.02, + "95_CI": [ + -1.88, + 1.66 + ], + "invalid_scores": 0 + }, + "category_creative_writing": { + "num_entries": 250, + "score": 63.56, + "95_CI": [ + -2.72, + 2.8 + ], + "invalid_scores": 1 + } + } + } +} +``` ### AA-Omniscience diff --git a/nemo_skills/dataset/arena-hard-v2/__init__.py b/nemo_skills/dataset/arena-hard-v2/__init__.py new file mode 100644 index 0000000000..e21faaae89 --- /dev/null +++ b/nemo_skills/dataset/arena-hard-v2/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# settings that define how evaluation should be done by default (all can be changed from cmdline) +DATASET_GROUP = "chat" +METRICS_TYPE = "arena" +# using judgement directly in metrics, no need for special evaluation +GENERATION_ARGS = "++prompt_config=generic/default" + +JUDGE_PIPELINE_ARGS = { + "generation_module": "nemo_skills.inference.eval.arena_judge", + "model": "gpt-4.1", + "server_type": "openai", + "server_address": "https://api.openai.com/v1", +} diff --git a/nemo_skills/dataset/arena-hard-v2/prepare.py b/nemo_skills/dataset/arena-hard-v2/prepare.py new file mode 100644 index 0000000000..785be7bb49 --- /dev/null +++ b/nemo_skills/dataset/arena-hard-v2/prepare.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import urllib.request +from pathlib import Path + +URL_QUESTIONS = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/main/data/arena-hard-v2.0/question.jsonl" +# Category-specific baselines as per official arena-hard-auto implementation +URL_BASELINE_HARD_PROMPT = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/main/data/arena-hard-v2.0/model_answer/o3-mini-2025-01-31.jsonl" +URL_BASELINE_CREATIVE_WRITING = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/main/data/arena-hard-v2.0/model_answer/gemini-2.0-flash-001.jsonl" + +# Mapping of category to baseline URL +CATEGORY_BASELINES = { + "hard_prompt": URL_BASELINE_HARD_PROMPT, + "creative_writing": URL_BASELINE_CREATIVE_WRITING, +} + + +def extract_answer_text(data): + """Extract the answer text from the baseline model's response format.""" + messages = data["messages"] + for msg in messages: + if msg["role"] == "assistant": + content = msg["content"] + return content["answer"] if isinstance(content, dict) else content + raise ValueError("No assistant message found in the data.") + + +if __name__ == "__main__": + data_dir = Path(__file__).absolute().parent + data_dir.mkdir(exist_ok=True) + questions_file = str(data_dir / "question.jsonl") + output_file = str(data_dir / "test.jsonl") + + # Download questions + urllib.request.urlretrieve(URL_QUESTIONS, questions_file) + + # Download and process all baseline files + baseline_answers = {} + for category, url in CATEGORY_BASELINES.items(): + baseline_file = str(data_dir / f"baseline_{category}.jsonl") + urllib.request.urlretrieve(url, baseline_file) + + with open(baseline_file, "rt", encoding="utf-8") as fin: + for line in fin: + data = json.loads(line) + uid = data["uid"] + if uid not in baseline_answers: + baseline_answers[uid] = {} + baseline_answers[uid][category] = extract_answer_text(data) + + # Create test.jsonl with category-specific baseline answers + with open(questions_file, "rt", encoding="utf-8") as fin, 
open(output_file, "wt", encoding="utf-8") as fout: + for line in fin: + data = json.loads(line) + data["question"] = data.pop("prompt") + category = data["category"] + data["baseline_answer"] = baseline_answers[data["uid"]][category] + fout.write(json.dumps(data) + "\n") diff --git a/nemo_skills/dataset/arena-hard/prepare.py b/nemo_skills/dataset/arena-hard/prepare.py index ad815a12d0..a7e2e1352a 100644 --- a/nemo_skills/dataset/arena-hard/prepare.py +++ b/nemo_skills/dataset/arena-hard/prepare.py @@ -16,9 +16,9 @@ import urllib.request from pathlib import Path -URL_QUESTIONS = "https://raw.githubusercontent.com/lm-sys/arena-hard-auto/main/data/arena-hard-v0.1/question.jsonl" +URL_QUESTIONS = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/main/data/arena-hard-v0.1/question.jsonl" URL_BASELINE = ( - "https://raw.githubusercontent.com/lm-sys/arena-hard-auto/main/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl" + "https://raw.githubusercontent.com/lmarena/arena-hard-auto/main/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl" ) diff --git a/nemo_skills/evaluation/metrics/arena_metrics.py b/nemo_skills/evaluation/metrics/arena_metrics.py index 8fe9e6970f..2b9d95c1b6 100644 --- a/nemo_skills/evaluation/metrics/arena_metrics.py +++ b/nemo_skills/evaluation/metrics/arena_metrics.py @@ -13,6 +13,7 @@ # limitations under the License. 
import re +from collections import defaultdict from nemo_skills.evaluation.metrics.base import BaseMetrics @@ -51,6 +52,11 @@ def update(self, predictions): super().update(predictions) self.scores.append([]) self.agg_mode = f"pass@{len(predictions)}" + + # Track category for per-category scoring (defaults to None for v1 compatibility) + category = predictions[0].get("category") + self.categories.append(category) + if len(predictions) > 1: judge_scores = [self._get_judge_score(elem["judgement-gen-base"]) for elem in predictions] # adding the best score out of all the generations @@ -86,16 +92,34 @@ def update(self, predictions): def get_metrics(self): from nemo_skills.evaluation.evaluator.arena import get_aggregate_score - metrics = {"num_entries": self.total} - metrics.update(get_aggregate_score(self.scores)) - metrics_dict = {self.agg_mode: metrics} - self.update_common_metrics(metrics_dict[self.agg_mode]) + metrics_dict = {} + + # Compute overall metrics + overall_metrics = {"num_entries": self.total} + overall_metrics.update(get_aggregate_score(self.scores)) + self.update_common_metrics(overall_metrics) + + # Group scores by category for per-category metrics + category_scores = defaultdict(list) + for score, category in zip(self.scores, self.categories, strict=True): + category_scores[category].append(score) + + # If we have multiple categories, compute per-category metrics + unique_categories = set(self.categories) + if len(unique_categories) > 1: + for category, scores in category_scores.items(): + cat_metrics = {"num_entries": len(scores)} + cat_metrics.update(get_aggregate_score(scores)) + overall_metrics[f"category_{category}"] = cat_metrics + + metrics_dict[self.agg_mode] = overall_metrics # arena metrics have their own confidence estimation, so not doing std metrics here return metrics_dict def reset(self): super().reset() self.scores = [] # list of lists + self.categories = [] # list of category strings self.lengths = 0 # TODO: the class should support 
pass@k, but this forces it to report as pass@1. # There is some error here for k>1 diff --git a/nemo_skills/inference/eval/arena_judge.py b/nemo_skills/inference/eval/arena_judge.py index 60daac0840..6312e01360 100644 --- a/nemo_skills/inference/eval/arena_judge.py +++ b/nemo_skills/inference/eval/arena_judge.py @@ -26,6 +26,8 @@ InferenceConfig, ) from nemo_skills.inference.model import server_params +from nemo_skills.inference.model.base import EndpointType +from nemo_skills.prompt.utils import get_prompt from nemo_skills.utils import ( get_help_message, get_logger_name, @@ -48,9 +50,15 @@ class ArenaJudgeConfig(GenerationTaskConfig): server: dict = field(default_factory=dict) # Override the default Generation config here + # prompt_config is used as the default for any category not explicitly mapped below prompt_config: str = "judge/arena" generation_key: str = "judgement" + # Category-specific prompt config overrides (arena-hard-v2 uses different prompts per category) + # Set to None to use the default prompt_config for that category + # creative_writing uses a prompt that doesn't ask the judge to generate its own answer first + prompt_config_creative: str = "judge/arena_creative" + cs = hydra.core.config_store.ConfigStore.instance() cs.store(name="base_arena_judge_config", node=ArenaJudgeConfig) @@ -60,6 +68,63 @@ class ArenaJudgeTask(GenerationTask): def __init__(self, cfg: ArenaJudgeConfig): super().__init__(cfg) + def setup_prompt(self): + if self.cfg.prompt_format == "openai": + return None + + # Load the default prompt (used for most categories including hard_prompt, arena-hard-v0.1, etc.) 
+ default_prompt = get_prompt( + prompt_config=self.cfg.prompt_config, + tokenizer=self.tokenizer, + code_tags=self.cfg.code_tags, + examples_type=self.cfg.examples_type, + system_message=self.cfg.system_message, + ) + + # Load category-specific prompt overrides + self.category_prompts = {} + if self.cfg.prompt_config_creative: + self.category_prompts["creative_writing"] = get_prompt( + prompt_config=self.cfg.prompt_config_creative, + tokenizer=self.tokenizer, + code_tags=self.cfg.code_tags, + examples_type=self.cfg.examples_type, + system_message=self.cfg.system_message, + ) + LOG.info("Prompt used (creative_writing): %s", self.category_prompts["creative_writing"]) + # registering default prompt explicitly for hard_prompt + self.category_prompts["hard_prompt"] = default_prompt + + LOG.info("Prompt used (default): %s", default_prompt) + return default_prompt + + def fill_prompt(self, data_point, data): + """Fill prompt with category-specific prompt config.""" + if self.cfg.prompt_format == "openai": + return super().fill_prompt(data_point, data) + + # Select the appropriate prompt based on category. 
If not defined, forcing fall-back to default prompt + category = data_point.get("category") + if not category: + prompt = self.prompt + else: + # will fail if category not in category_prompts as this is unexpected + prompt = self.category_prompts[category] + + data_point = deepcopy(data_point) + filled_prompt = prompt.fill( + data_point, + start_assistant_response_key=self.cfg.start_assistant_response_key, + chat_template_kwargs=self.cfg.chat_template_kwargs, + format_as_string=(self.cfg.inference.endpoint_type == EndpointType.text), + ) + if self.cfg.prompt_suffix: + if isinstance(filled_prompt, list): + filled_prompt[-1]["content"] += self.cfg.prompt_suffix + else: + filled_prompt += self.cfg.prompt_suffix + return filled_prompt + def log_example_prompt(self, all_data): data_point = deepcopy(all_data[0]) diff --git a/nemo_skills/prompt/config/judge/arena_creative.yaml b/nemo_skills/prompt/config/judge/arena_creative.yaml new file mode 100644 index 0000000000..70ed7ee2a9 --- /dev/null +++ b/nemo_skills/prompt/config/judge/arena_creative.yaml @@ -0,0 +1,33 @@ +# from https://github.com/lm-sys/arena-hard-auto/blob/main/utils/judge_utils.py +# This is the creative_writing variant that does NOT ask the judge to generate its own answer first. + +system: |- + Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better. + + When evaluating the assistants' answers, compare both assistants' answers. You must identify and correct any mistakes or inaccurate information. + + Then consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. 
Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive. + + Then consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt. + + After providing your explanation, you must output only one of the following choices as your final verdict with a label: + + 1. Assistant A is significantly better: [[A>>B]] + 2. Assistant A is slightly better: [[A>B]] + 3. Tie, relatively the same: [[A=B]] + 4. Assistant B is slightly better: [[B>A]] + 5. Assistant B is significantly better: [[B>>A]] + + Example output: "My final verdict is tie: [[A=B]]". + +user: |- + <|User Prompt|> + {question} + + <|The Start of Assistant A's Answer|> + {answer_1} + <|The End of Assistant A's Answer|> + + <|The Start of Assistant B's Answer|> + {answer_2} + <|The End of Assistant B's Answer|> diff --git a/tests/test_arena_metrics.py b/tests/test_arena_metrics.py new file mode 100644 index 0000000000..27cd2bab88 --- /dev/null +++ b/tests/test_arena_metrics.py @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +from nemo_skills.evaluation.metrics.arena_metrics import ArenaMetrics + + +def _make_prediction(gen_base_score, base_gen_score, category=None): + """Helper to create a prediction dict with judgment scores.""" + pred = { + "judgement-gen-base": f"[[{gen_base_score}]]", + "judgement-base-gen": f"[[{base_gen_score}]]", + } + if category is not None: + pred["category"] = category + return pred + + +def test_arena_metrics_per_category_scoring_v2(): + """Test that arena-hard-v2 with multiple categories produces per-category scores.""" + m = ArenaMetrics() + + random.seed(42) + scores_pool = [("A>B", "B>A"), ("B>A", "A>B"), ("A=B", "A=B"), ("A>>B", "B>>A"), ("B>>A", "A>>B")] + + # 50 hard_prompt entries + for _ in range(50): + score = random.choice(scores_pool) + m.update([_make_prediction(score[0], score[1], category="hard_prompt")]) + + # 25 creative_writing entries + for _ in range(25): + score = random.choice(scores_pool) + m.update([_make_prediction(score[0], score[1], category="creative_writing")]) + + assert m.total == 75 + assert set(m.categories) == {"hard_prompt", "creative_writing"} + + metrics = m.get_metrics() + + # Check overall metrics exist + assert "pass@1" in metrics + assert metrics["pass@1"]["num_entries"] == 75 + assert "score" in metrics["pass@1"] + assert "95_CI" in metrics["pass@1"] + + # Check per-category metrics exist + assert "category_hard_prompt" in metrics["pass@1"] + assert metrics["pass@1"]["category_hard_prompt"]["num_entries"] == 50 + assert "score" in metrics["pass@1"]["category_hard_prompt"] + + assert "category_creative_writing" in metrics["pass@1"] + assert metrics["pass@1"]["category_creative_writing"]["num_entries"] == 25 + assert "score" in metrics["pass@1"]["category_creative_writing"] + + +def test_arena_metrics_single_category_v1(): + """Test that arena-hard-v1 with single category does not produce 
per-category breakdown.""" + m = ArenaMetrics() + + random.seed(42) + scores_pool = [("A>B", "B>A"), ("B>A", "A>B"), ("A=B", "A=B"), ("A>>B", "B>>A"), ("B>>A", "A>>B")] + + # All entries have same category (v1 scenario) + for _ in range(50): + score = random.choice(scores_pool) + m.update([_make_prediction(score[0], score[1], category="arena-hard-v0.1")]) + + assert m.total == 50 + assert set(m.categories) == {"arena-hard-v0.1"} + + metrics = m.get_metrics() + + # Check overall metrics exist + assert "pass@1" in metrics + assert metrics["pass@1"]["num_entries"] == 50 + assert "score" in metrics["pass@1"] + + # Check no per-category breakdown for single category + has_category_keys = any(k.startswith("category_") for k in metrics["pass@1"].keys()) + assert not has_category_keys + + +def test_arena_metrics_legacy_data_no_category(): + """Test that legacy data without category field works correctly.""" + m = ArenaMetrics() + + random.seed(42) + scores_pool = [("A>B", "B>A"), ("B>A", "A>B"), ("A=B", "A=B")] + + # Data without category field + for _ in range(30): + score = random.choice(scores_pool) + m.update([_make_prediction(score[0], score[1])]) # No category + + assert m.total == 30 + assert set(m.categories) == {None} + + metrics = m.get_metrics() + + # Check overall metrics exist + assert "pass@1" in metrics + assert metrics["pass@1"]["num_entries"] == 30 + assert "score" in metrics["pass@1"] + + # Check no per-category breakdown + has_category_keys = any(k.startswith("category_") for k in metrics["pass@1"].keys()) + assert not has_category_keys + + +def test_arena_metrics_score_parsing(): + """Test that judge scores are correctly parsed.""" + m = ArenaMetrics() + + # Test various score formats + test_cases = [ + ("A>>B", "A>>B"), + ("A>B", "A>B"), + ("A=B", "A=B"), + ("B>A", "B>A"), + ("B>>A", "B>>A"), + ] + + for gen_base, base_gen in test_cases: + m.reset() + m.update([_make_prediction(gen_base, base_gen, category="test")]) + assert m.scores[0] == [gen_base, 
base_gen] + + +def test_arena_metrics_invalid_score_handling(): + """Test that invalid scores are handled correctly.""" + m = ArenaMetrics() + + # Invalid score format + pred = { + "judgement-gen-base": "No valid score here", + "judgement-base-gen": "Also invalid", + "category": "test", + } + m.update([pred]) + + assert m.scores[0] == [None, None]