diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills index 2474d850b3..3d35f10dc8 100644 --- a/dockerfiles/Dockerfile.nemo-skills +++ b/dockerfiles/Dockerfile.nemo-skills @@ -59,7 +59,7 @@ COPY pyproject.toml README.md /opt/NeMo-Skills/ COPY requirements /opt/NeMo-Skills/requirements/ # installing sdp in container only RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2 -ARG CACHEBUST=1 +ARG CACHEBUST=2 RUN pip install --no-cache-dir -r /opt/NeMo-Skills/requirements/main.txt # Fix http mismatch between lepton and dggs by manually downloading dggs here RUN pip install ddgs diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 68decc1645..5be1b81ea1 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -316,6 +316,46 @@ Due to variance between runs, you can automatically repeat the evaluation and av --benchmarks=livecodebench:3 ``` +### BIRD + +The [BIRD benchmark](https://bird-bench.github.io/) is currently the only text-to-SQL benchmark that is supported. Evaluation is based on the SQL evaluation accuracy calculated in [this file](https://github.com/AlibabaResearch/DAMO-ConvAI/blob/main/bird/llm/src/evaluation.py) provided in the BIRD GitHub repository. + +#### Data Preparation + + +First, the data must be downloaded and prepared, which you can do by running: +```bash +ns prepare_data birdbench --cluster=<cluster_config> --data_dir=<data_dir> +``` + +This will download and unpack a file into `<data_dir>/birdbench/dev_20240627`, which contains the BIRD dev manifest, table information, and database schemas. +The script will also process the original manifest into `<data_dir>/birdbench/dev.jsonl`, which will be the input for evaluation. +`<data_dir>` should be a path to the mount point where you want this data to be stored. + +See [the "Using data on cluster" documentation](./index.md#using-data-on-cluster) for more information.
+ +#### Running the Evaluation + +The following command runs an evaluation of [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on a Slurm cluster. + +```bash +ns eval \ + --cluster=<cluster_config> \ + --server_type='sglang' \ + --server_gpus=8 \ + --model=Qwen/Qwen3-8B \ + --benchmarks=birdbench \ + --data_dir=<data_dir> \ + --output_dir=<output_dir> \ + ++inference.tokens_to_generate=10000 \ + ++inference.temperature=0.6 \ + ++inference.top_p=0.95 \ + ++inference.top_k=20 \ + ++max_concurrent_requests=1024 +``` +You should specify: `<cluster_config>`, which should match your cluster config name; `<data_dir>`, which should be the location where your dataset is mounted from the cluster; and `<output_dir>`. +The first two arguments should match what you used in `prepare_data`. + ### livecodebench-cpp - Benchmark is defined in [`nemo_skills/dataset/livecodebench-cpp/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livecodebench-cpp/__init__.py) diff --git a/docs/index.md b/docs/index.md index a17dce8f81..32f216415b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,7 +16,7 @@ Here are some of the features we support: - Evaluate your models on many popular benchmarks. - [**Math (natural language**)](./evaluation/natural-math.md): e.g. [aime24](./evaluation/natural-math.md#aime24), [aime25](./evaluation/natural-math.md#aime25), [hmmt_feb25](./evaluation/natural-math.md#hmmt_feb25) - [**Math (formal language)**](./evaluation/formal-math.md): e.g. [minif2f](./evaluation/formal-math.md#minif2f), [proofnet](./evaluation/formal-math.md#proofnet), [putnam-bench](./evaluation/formal-math.md#putnam-bench) - - [**Code**](./evaluation/code.md): e.g. [swe-bench](./evaluation/code.md#swe-bench), [livecodebench](./evaluation/code.md#livecodebench) + - [**Code**](./evaluation/code.md): e.g.
[swe-bench](./evaluation/code.md#swe-bench), [livecodebench](./evaluation/code.md#livecodebench), [bird](./evaluation/code.md#bird) - [**Scientific knowledge**](./evaluation/scientific-knowledge.md): e.g., [hle](./evaluation/scientific-knowledge.md#hle), [scicode](./evaluation/scientific-knowledge.md#scicode), [gpqa](./evaluation/scientific-knowledge.md#gpqa) - [**Instruction following**](./evaluation/instruction-following.md): e.g. [ifbench](./evaluation/instruction-following.md#ifbench), [ifeval](./evaluation/instruction-following.md#ifeval) - [**Long-context**](./evaluation/long-context.md): e.g. [ruler](./evaluation/long-context.md#ruler), [mrcr](./evaluation/long-context.md#mrcr) diff --git a/nemo_skills/dataset/birdbench/__init__.py b/nemo_skills/dataset/birdbench/__init__.py new file mode 100644 index 0000000000..55505365cc --- /dev/null +++ b/nemo_skills/dataset/birdbench/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# settings that define how evaluation should be done by default (all can be changed from cmdline) +DATASET_GROUP = "code" +METRICS_TYPE = "bird" +EVAL_SPLIT = "dev" +GENERATION_ARGS = ( + "++prompt_config=generic/text_to_sql " + "++eval_type=bird " + "++inference.tokens_to_generate=10000 " + "++inference.temperature=0.6 " + "++inference.top_p=0.95 " + "++inference.top_k=20 " + "++max_concurrent_requests=1024" +) diff --git a/nemo_skills/dataset/birdbench/prepare.py b/nemo_skills/dataset/birdbench/prepare.py new file mode 100644 index 0000000000..07c195553b --- /dev/null +++ b/nemo_skills/dataset/birdbench/prepare.py @@ -0,0 +1,126 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import glob +import json +import os +import re +import sqlite3 +import zipfile +from pathlib import Path + +import wget + + +def download_data(data_dir): + # Download zip directly (HF Dataset is missing SQL files and table info) + print("Downloading and extracting data file...") + url = "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip" + filename = wget.download(url, out=data_dir) + with zipfile.ZipFile(Path(data_dir, filename), "r") as f_in: + f_in.extractall(data_dir) + + # Expand tables zipfiles + print("Extracting databases...") + dev_dir = Path(data_dir, "dev_20240627/") + dbs_zipfile = Path(dev_dir, "dev_databases.zip") + with zipfile.ZipFile(dbs_zipfile, "r") as f_dbs: + f_dbs.extractall(dev_dir) + + print("Extracted all data!") + return dev_dir + + +def read_tables_file(base_dir): + """ + Gets each db's information by using sqlite3 to get a table dump. + """ + tables_info = {} + all_db_dirs = glob.glob("*", root_dir=os.path.join(base_dir, "dev_databases")) + + for db_dir in all_db_dirs: + print(f"Reading database info from: {db_dir}") + table_info = "" + + # Grab the db's sqlite file & read the dump + full_db_dir = os.path.join(base_dir, "dev_databases", db_dir) + sqlite_file = os.path.join(full_db_dir, db_dir + ".sqlite") + assert os.path.exists(sqlite_file) + + with sqlite3.connect(os.path.join(full_db_dir, db_dir + ".sqlite")) as con: + con.text_factory = lambda b: b.decode(errors="ignore") + for line in con.iterdump(): + if line[:6] == "INSERT": + line = line.replace("\n", " ") + line = re.sub(" +", " ", line) + table_info += line + "\n" + + # Time to truncate any long INSERT chains (allow 10 max at once) + insert_chain = r"((INSERT.*$\n){10})((INSERT.*\n)*)" + table_info = re.sub(insert_chain, r"\1\n...\n", table_info, flags=re.MULTILINE) + + # Also get rid of any INSERT INTO * VALUES (...) 
<- lots of entries (>10) + many_values = r"(?:VALUES )(((\([^)]*)\)[,;]\s*)){10}(.*)(?:;)" + table_info = re.sub(many_values, r"...", table_info, flags=re.MULTILINE) + + tables_info[db_dir] = table_info + + return tables_info + + +def format_entries(file_path, tables_info, out_file): + """ + Combines the raw BIRD data entries with corresponding table info and + ground truth solution to form dev manifest + """ + with open(out_file, "w") as f_out: + with open(file_path, "r") as f_in: + entries = json.load(f_in) + + for i, entry in enumerate(entries): + new_entry = { + "question": entry["question"], + "gt_sql": entry["SQL"], + "sql_context": tables_info[entry["db_id"]], + "difficulty": entry["difficulty"], + "db_id": entry["db_id"], + "id": i, + } + + f_out.write(json.dumps(new_entry)) + f_out.write("\n") + + +def main(): + data_dir = str(Path(__file__).absolute().parent) + + dev_dir = download_data(data_dir) + # If already downloaded: dev_dir = Path(data_dir, "dev_20240627/") + print(f"\nData downloaded to: {dev_dir}") + + print("Starting processing...") + + # First read tables data + tables_info = read_tables_file(dev_dir) + print("Finished reading tables.") + + # Naming the input and output files the nearly same thing is likely + # confusing, but .jsonl is the expected format so we'll just + # keep the result in the upper-level directory, outside of dev_dir. + format_entries(Path(dev_dir, "dev.json"), tables_info, Path(data_dir, "dev.jsonl")) + print("Finished formatting entries. 
All done!") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index d27df361c4..68527646d5 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -18,6 +18,7 @@ from nemo_skills.evaluation.evaluator.audio import AudioEvaluator from nemo_skills.evaluation.evaluator.base import BaseEvaluator from nemo_skills.evaluation.evaluator.bfcl import eval_bfcl +from nemo_skills.evaluation.evaluator.bird import BirdEvaluator from nemo_skills.evaluation.evaluator.code import ( CodeExecEvaluator, eval_bigcodebench, @@ -67,6 +68,7 @@ "ioi": IOIEvaluator, "icpc": ICPCEvaluator, "audio": AudioEvaluator, + "bird": BirdEvaluator, } # Validation: Ensure no overlap between class and function maps diff --git a/nemo_skills/evaluation/evaluator/bird.py b/nemo_skills/evaluation/evaluator/bird.py new file mode 100644 index 0000000000..df51c4d709 --- /dev/null +++ b/nemo_skills/evaluation/evaluator/bird.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import re +import sqlite3 +from pathlib import Path + +from func_timeout import FunctionTimedOut, func_timeout + +from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig +from nemo_skills.utils import nested_dataclass + +# The following code was modified from: +# https://github.com/AlibabaResearch/DAMO-ConvAI/blob/main/bird/llm/src/evaluation.py + +# Original license as follows: + +# MIT License +# +# Copyright (c) 2022 Alibaba Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +def execute_sql(predicted_sql, ground_truth, db_path): + # Connect to the database + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + cursor.execute(predicted_sql) + predicted_res = cursor.fetchall() + cursor.execute(ground_truth) + ground_truth_res = cursor.fetchall() + res = 0 + if set(predicted_res) == set(ground_truth_res): + res = 1 + return res + + +# ===== End of copied and modified code. 
===== + + +@nested_dataclass(kw_only=True) +class BirdEvaluatorConfig(BaseEvaluatorConfig): + timeout: int = 30 + + # Answer format can be "BOXED", "CODEBLOCK", or "USE_REGEX", the last of + # which uses the given regex in the extraction_regex arg. + answer_format: str = "CODEBLOCK" + extraction_regex: str | None = None + regex_dotall: bool = False + + +class BirdEvaluator(BaseEvaluator): + def __init__(self, config: dict, num_parallel_requests=10): + super().__init__(config, num_parallel_requests) + self.eval_config = BirdEvaluatorConfig(**self.config) + + self.db_path = Path(self.eval_config.data_dir, "birdbench", "dev_20240627", "dev_databases") + + def _extract_answer(self, text): + """Uses the specified format/regex to get the answer from the output text.""" + regex = "" + dotall = False + answer_format = self.eval_config.answer_format + + if answer_format == "CODEBLOCK": + regex = r"(?:```sql)(.*?[a-zA-Z].*?)(?:```)" + dotall = True + elif answer_format == "BOXED": + regex = r"(?:boxed\{\{)(.*?[a-zA-Z].*?)(?:\}\})" + dotall = True + elif answer_format == "USE_REGEX": + regex = self.eval_config.extraction_regex + dotall = self.eval_config.regex_dotall + + if not regex: + logging.error( + "Answer format underspecified for BIRD evaluation; should be one of " + + "{CODEBLOCK, BOXED, USE_REGEX (provide extraction_regex)}.\n" + + f"Got {answer_format} instead." 
+ ) + + # Use regex to extract answer from text + if dotall: + code_matches = re.findall(regex, text, flags=re.DOTALL) + else: + code_matches = re.findall(regex, text) + + if not code_matches: + return "SELECT 1" # No-op filler + + # Remove comments first + ans = re.sub(r"--.*?$|/\*.*?\*/", "", code_matches[-1], flags=re.DOTALL) # Use last match + # Collapse whitespace + ans = re.sub(r"\s+", " ", ans) + # Remove miscellaneous headers that snuck in + ans = re.sub(r"^\*\*.*\*\*", "", ans).strip() + + return ans + + async def eval_single(self, data_point: dict): + i = data_point["id"] + db_id = data_point["db_id"] + + # Retrieve pred and gt + predicted_sql = self._extract_answer(data_point["generation"]) + ground_truth = data_point["gt_sql"] + db_place = str(Path(self.db_path, db_id, db_id + ".sqlite")) + + try: + # Wait for result with timeout as set + res = func_timeout(self.eval_config.timeout, execute_sql, args=(predicted_sql, ground_truth, db_place)) + except FunctionTimedOut: + logging.info(f"SQL execution timed out for entry {i}") + res = 0 + except Exception as e: + logging.info(f"SQL execution failed for entry {i}:\n{e}") + res = 0 + + data_point["res"] = res + return data_point diff --git a/nemo_skills/evaluation/metrics/bird_metrics.py b/nemo_skills/evaluation/metrics/bird_metrics.py new file mode 100644 index 0000000000..24ed32a7c4 --- /dev/null +++ b/nemo_skills/evaluation/metrics/bird_metrics.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float + + +class BirdMetrics(BaseMetrics): + """Metrics for BIRD text-to-SQL evaluation.""" + + def __init__(self): + super().__init__() + self.reset() + + def reset(self): + super().reset() + self.n = 0 + self.correct = 0 + self.simple_results = [] + self.moderate_results = [] + self.challenging_results = [] + + def update(self, predictions): + self.n += len(predictions) + + for pred in predictions: + # Each should be a 0 or 1 value + if pred["difficulty"] == "simple": + self.simple_results.append(pred["res"]) + elif pred["difficulty"] == "moderate": + self.moderate_results.append(pred["res"]) + elif pred["difficulty"] == "challenging": + self.challenging_results.append(pred["res"]) + + self.correct += pred["res"] + + def get_metrics(self): + sr = self.simple_results + mr = self.moderate_results + cr = self.challenging_results + + simple_acc = sum(sr) / len(sr) if sr else 0 + moderate_acc = sum(mr) / len(mr) if mr else 0 + challenging_acc = sum(cr) / len(cr) if cr else 0 + + acc = self.correct / self.n if self.n else 0 + + metrics_dict = {} + metrics_dict["total"] = { + "simple_acc": simple_acc * 100, + "moderate_acc": moderate_acc * 100, + "challenging_acc": challenging_acc * 100, + "total_acc": acc * 100, + } + return metrics_dict + + def evaluations_to_print(self): + return ["total"] + + def metrics_to_print(self): + metrics_to_print = { + "simple_acc": as_float, + "moderate_acc": as_float, + "challenging_acc": as_float, + "total_acc": as_float, + } + return metrics_to_print diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py index 94f8a4bba4..ab508c4a41 100644 --- a/nemo_skills/evaluation/metrics/map_metrics.py +++ b/nemo_skills/evaluation/metrics/map_metrics.py @@ -21,6 +21,7 @@ from nemo_skills.evaluation.metrics.arena_metrics import 
ArenaMetrics from nemo_skills.evaluation.metrics.audio_metrics import AudioMetrics from nemo_skills.evaluation.metrics.bfcl_metrics import BFCLMetrics +from nemo_skills.evaluation.metrics.bird_metrics import BirdMetrics from nemo_skills.evaluation.metrics.code_metrics import ( BigCodeBenchMetrics, EvalPlusMetrics, @@ -50,6 +51,7 @@ "arena": ArenaMetrics, "audio": AudioMetrics, "bfcl": BFCLMetrics, + "bird": BirdMetrics, "evalplus": EvalPlusMetrics, "if": IFMetrics, "ioi": IOIMetrics, diff --git a/nemo_skills/prompt/config/generic/text_to_sql.yaml b/nemo_skills/prompt/config/generic/text_to_sql.yaml new file mode 100644 index 0000000000..5a383c5276 --- /dev/null +++ b/nemo_skills/prompt/config/generic/text_to_sql.yaml @@ -0,0 +1,15 @@ +# Prompt used for text-to-SQL + +system: Please reason step by step, and put your final answer within the tags "```sql" and "```". + +user: |- + ### Question + {question} + + The following is a SQL dump that describes the database and the tables in it. + {sql_context} + + Convert the question above to a SQL query for the database given. Use the answer tags given. + If there is more than one set of tags, the last one will be taken as your final answer. + + ### Answer: diff --git a/requirements/main.txt b/requirements/main.txt index f140a97b50..2ebd4d3b4b 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -22,6 +22,7 @@ fire # needed local code execution server for persistent sessions flask +func-timeout # Needed for BIRD benchmark gradio httpx huggingface_hub