Merged
48 commits
f98958f
Add BIRD benchmark preparation script
redoctopus Dec 9, 2025
ac51852
Add prompt config for text-to-sql (qwen3)
redoctopus Dec 9, 2025
5db1d9c
Moved text-to-SQL prompt config to generic/
redoctopus Dec 9, 2025
81970f1
BIRD evaluation class draft
redoctopus Dec 11, 2025
92823ae
Draft eval/metrics for BIRD benchmark
redoctopus Dec 12, 2025
b403c47
Try different timeout method
redoctopus Dec 12, 2025
86b95e4
Remove func_timeout from evaluation file (and trim other functions)
redoctopus Dec 12, 2025
98fa12e
Try fixing metrics arg for birdbench init file
redoctopus Dec 12, 2025
72d31bb
Commit of shame: minor typo correction
redoctopus Dec 13, 2025
1932ae6
s/-/_/ in config path
redoctopus Dec 13, 2025
be8b619
Fix config path for Bird eval init file
redoctopus Dec 13, 2025
7bef844
Fix BIRD eval config, update paths to data
redoctopus Dec 13, 2025
6a98d4b
Whoops load file before trying to parse it
redoctopus Dec 13, 2025
cee066f
Add id to data point to allow for ground truth comparison
redoctopus Dec 15, 2025
e652fe6
Path to string for jsom serialization
redoctopus Dec 15, 2025
50bfcb1
Fix SQL eval hanging on timeout
redoctopus Dec 16, 2025
4ba8729
Remove unnecessary sort, fixed minor bugs in birdbench
redoctopus Dec 16, 2025
20cf10f
Debugging metrics
redoctopus Dec 16, 2025
dd9adb1
Added checks for zero length lists in BIRD metrics
redoctopus Dec 16, 2025
044261e
Fix BIRD metrics update step and reporting
redoctopus Dec 16, 2025
e290cc6
BIRD eval printing
redoctopus Dec 16, 2025
a69e895
Cleaning up some loose code (birdbench)
redoctopus Dec 16, 2025
588fd86
Ruff style fixes
redoctopus Dec 16, 2025
1ad1ddb
Add BIRD benchmark documentation
redoctopus Dec 17, 2025
ecde59a
Remove dockerfile install in favor of commandline arg
redoctopus Dec 17, 2025
cf0e0d0
Fix style issues
redoctopus Dec 17, 2025
37ebe15
Refactor to avoid re-loading files during evaluation (BIRD), plus cod…
redoctopus Dec 17, 2025
d3b8fbf
Add pred to data dict, fix sqlite3 call
redoctopus Dec 18, 2025
996ea0d
Lint fixes
redoctopus Dec 18, 2025
7280bfd
BIRD bench fixes, removed extra entry saves from eval
redoctopus Dec 18, 2025
7f5b0d5
Add super() calls to BIRD metric
redoctopus Dec 18, 2025
3fbcf19
Add func_timeout to reqs
redoctopus Dec 18, 2025
74aa4d6
Minor lint fixes
redoctopus Dec 18, 2025
292a66f
Somehow missed an import
redoctopus Dec 18, 2025
23585b8
Reattempt at learning the alphabet
redoctopus Dec 18, 2025
490efe6
Add check for db_dir, move evaluation function, remove superfluous ev…
redoctopus Dec 19, 2025
4356c00
Lint fixes
redoctopus Dec 19, 2025
8cf5373
Whitespace massaging to appease the linter
redoctopus Dec 19, 2025
ac39390
Remove manual check for BIRD eval file param
redoctopus Dec 19, 2025
21be22c
Remove db_dir arg in favor of relative path
redoctopus Dec 19, 2025
66aace8
Clean up unused arg
redoctopus Dec 19, 2025
f6cfae2
BIRD: remove output dir arg in favor of data_dir
redoctopus Dec 19, 2025
6d4daec
Whitespace removal
redoctopus Dec 20, 2025
1a259df
Merge branch 'main' into birdbench
redoctopus Jan 6, 2026
d5c37eb
Slight clarification to BIRD docs re data_dir
redoctopus Jan 6, 2026
f4efad7
More robust SQL comment removal
redoctopus Jan 6, 2026
97bf47b
Increment counter for dependency fix
redoctopus Jan 6, 2026
465e781
Fix SQL comment regex
redoctopus Jan 6, 2026
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.nemo-skills
@@ -59,7 +59,7 @@ COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY requirements /opt/NeMo-Skills/requirements/
# installing sdp in container only
RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2
ARG CACHEBUST=1
ARG CACHEBUST=2
RUN pip install --no-cache-dir -r /opt/NeMo-Skills/requirements/main.txt
# Fix http mismatch between lepton and dggs by manually downloading dggs here
RUN pip install ddgs
40 changes: 40 additions & 0 deletions docs/evaluation/code.md
@@ -316,6 +316,46 @@ Due to variance between runs, you can automatically repeat the evaluation and av
--benchmarks=livecodebench:3
```

### BIRD

The [BIRD benchmark](https://bird-bench.github.io/) is currently the only supported text-to-SQL benchmark. Evaluation is based on the execution accuracy computed in [this file](https://github.com/AlibabaResearch/DAMO-ConvAI/blob/main/bird/llm/src/evaluation.py) from the BIRD GitHub repository.
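
The metric is execution accuracy: a prediction counts as correct when it returns the same set of rows as the ground-truth query. A minimal sketch of that check (illustrative table and queries, not BIRD data):

```python
import os
import sqlite3
import tempfile


def execution_match(pred_sql: str, gold_sql: str, db_path: str) -> int:
    """Return 1 if both queries yield the same set of rows, else 0."""
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        pred_rows = set(cur.execute(pred_sql).fetchall())
        gold_rows = set(cur.execute(gold_sql).fetchall())
    finally:
        conn.close()
    return int(pred_rows == gold_rows)


# Build a throwaway database to demonstrate the comparison
fd, path = tempfile.mkstemp(suffix=".sqlite")
os.close(fd)
conn = sqlite3.connect(path)
conn.execute("CREATE TABLE users(name TEXT, year INT)")
conn.executemany("INSERT INTO users VALUES (?, ?)", [("ada", 2023), ("bob", 2024)])
conn.commit()
conn.close()

print(execution_match("SELECT name FROM users ORDER BY name", "SELECT name FROM users", path))  # 1: row order is ignored
print(execution_match("SELECT name FROM users WHERE year = 2023", "SELECT name FROM users", path))  # 0: different rows
```

Note that the set comparison also ignores duplicate rows, matching the reference implementation's behavior.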

#### Data Preparation


First, the data must be downloaded and prepared, which you can do by running:
```bash
ns prepare_data birdbench --cluster=<CLUSTER_NAME> --data_dir=<DATA_DIR>
```

This will download and unpack a file into `<DATA_DIR>/birdbench/dev_20240627`, which contains the BIRD dev manifest, table information, and database schemas.
The script will also process the original manifest into `<DATA_DIR>/birdbench/dev.jsonl`, which will be the input for evaluation.
`<DATA_DIR>` should be a path to the mount point where you want this data to be stored.

See [the "Using data on cluster" documentation](./index.md#using-data-on-cluster) for more information.
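
Each line of the resulting `dev.jsonl` is a JSON object carrying the question, the ground-truth SQL, the serialized schema context, and bookkeeping fields. A sketch of one such line (field names match the prepare script; the values here are illustrative, not real BIRD data):

```python
import json

# Shape of one dev.jsonl line (field names from the prepare script;
# the values are made up for illustration)
entry = {
    "question": "How many users signed up in 2023?",
    "gt_sql": "SELECT COUNT(*) FROM users WHERE year = 2023",
    "sql_context": "CREATE TABLE users(name TEXT, year INT);",
    "difficulty": "simple",
    "db_id": "example_db",
    "id": 0,
}
line = json.dumps(entry)
print(sorted(json.loads(line).keys()))
# ['db_id', 'difficulty', 'gt_sql', 'id', 'question', 'sql_context']
```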

#### Running the Evaluation

The following command runs an evaluation of [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on a Slurm cluster.

```bash
ns eval \
    --cluster=<CLUSTER_NAME> \
    --server_type='sglang' \
    --server_gpus=8 \
    --model=Qwen/Qwen3-8B \
    --benchmarks=birdbench \
    --data_dir=<DATA_DIR> \
    --output_dir=<OUTPUT_DIR> \
    ++inference.tokens_to_generate=10000 \
    ++inference.temperature=0.6 \
    ++inference.top_p=0.95 \
    ++inference.top_k=20 \
    ++max_concurrent_requests=1024
```
You should specify `<CLUSTER_NAME>` (your cluster config name), `<DATA_DIR>` (where the dataset is mounted on the cluster), and `<OUTPUT_DIR>` (where results are written).
The first two should match the values you used with `prepare_data`.

### livecodebench-cpp

- Benchmark is defined in [`nemo_skills/dataset/livecodebench-cpp/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livecodebench-cpp/__init__.py)
2 changes: 1 addition & 1 deletion docs/index.md
@@ -16,7 +16,7 @@ Here are some of the features we support:
- Evaluate your models on many popular benchmarks.
- [**Math (natural language**)](./evaluation/natural-math.md): e.g. [aime24](./evaluation/natural-math.md#aime24), [aime25](./evaluation/natural-math.md#aime25), [hmmt_feb25](./evaluation/natural-math.md#hmmt_feb25)
- [**Math (formal language)**](./evaluation/formal-math.md): e.g. [minif2f](./evaluation/formal-math.md#minif2f), [proofnet](./evaluation/formal-math.md#proofnet), [putnam-bench](./evaluation/formal-math.md#putnam-bench)
- [**Code**](./evaluation/code.md): e.g. [swe-bench](./evaluation/code.md#swe-bench), [livecodebench](./evaluation/code.md#livecodebench)
- [**Code**](./evaluation/code.md): e.g. [swe-bench](./evaluation/code.md#swe-bench), [livecodebench](./evaluation/code.md#livecodebench), [bird](./evaluation/code.md#bird)
- [**Scientific knowledge**](./evaluation/scientific-knowledge.md): e.g., [hle](./evaluation/scientific-knowledge.md#hle), [scicode](./evaluation/scientific-knowledge.md#scicode), [gpqa](./evaluation/scientific-knowledge.md#gpqa)
- [**Instruction following**](./evaluation/instruction-following.md): e.g. [ifbench](./evaluation/instruction-following.md#ifbench), [ifeval](./evaluation/instruction-following.md#ifeval)
- [**Long-context**](./evaluation/long-context.md): e.g. [ruler](./evaluation/long-context.md#ruler), [mrcr](./evaluation/long-context.md#mrcr)
27 changes: 27 additions & 0 deletions nemo_skills/dataset/birdbench/__init__.py
@@ -0,0 +1,27 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = "code"
METRICS_TYPE = "bird"
EVAL_SPLIT = "dev"
GENERATION_ARGS = (
    "++prompt_config=generic/text_to_sql "
    "++eval_type=bird "
    "++inference.tokens_to_generate=10000 "
    "++inference.temperature=0.6 "
    "++inference.top_p=0.95 "
    "++inference.top_k=20 "
    "++max_concurrent_requests=1024"
)
126 changes: 126 additions & 0 deletions nemo_skills/dataset/birdbench/prepare.py
@@ -0,0 +1,126 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import os
import re
import sqlite3
import zipfile
from pathlib import Path

import wget


def download_data(data_dir):
    # Download zip directly (HF Dataset is missing SQL files and table info)
    print("Downloading and extracting data file...")
    url = "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip"
    filename = wget.download(url, out=data_dir)
    with zipfile.ZipFile(Path(data_dir, filename), "r") as f_in:
        f_in.extractall(data_dir)

    # Expand tables zipfiles
    print("Extracting databases...")
    dev_dir = Path(data_dir, "dev_20240627/")
    dbs_zipfile = Path(dev_dir, "dev_databases.zip")
    with zipfile.ZipFile(dbs_zipfile, "r") as f_dbs:
        f_dbs.extractall(dev_dir)

    print("Extracted all data!")
    return dev_dir


def read_tables_file(base_dir):
    """
    Gets each db's information by using sqlite3 to get a table dump.
    """
    tables_info = {}
    all_db_dirs = glob.glob("*", root_dir=os.path.join(base_dir, "dev_databases"))

    for db_dir in all_db_dirs:
        print(f"Reading database info from: {db_dir}")
        table_info = ""

        # Grab the db's sqlite file & read the dump
        full_db_dir = os.path.join(base_dir, "dev_databases", db_dir)
        sqlite_file = os.path.join(full_db_dir, db_dir + ".sqlite")
        assert os.path.exists(sqlite_file)

        with sqlite3.connect(os.path.join(full_db_dir, db_dir + ".sqlite")) as con:
            con.text_factory = lambda b: b.decode(errors="ignore")
            for line in con.iterdump():
                if line[:6] == "INSERT":
                    line = line.replace("\n", " ")
                    line = re.sub(" +", " ", line)
                table_info += line + "\n"

        # Time to truncate any long INSERT chains (allow 10 max at once)
        insert_chain = r"((INSERT.*$\n){10})((INSERT.*\n)*)"
        table_info = re.sub(insert_chain, r"\1\n...\n", table_info, flags=re.MULTILINE)

        # Also get rid of any INSERT INTO * VALUES (...) <- lots of entries (>10)
        many_values = r"(?:VALUES )(((\([^)]*)\)[,;]\s*)){10}(.*)(?:;)"
        table_info = re.sub(many_values, r"...", table_info, flags=re.MULTILINE)

        tables_info[db_dir] = table_info

    return tables_info


def format_entries(file_path, tables_info, out_file):
    """
    Combines the raw BIRD data entries with corresponding table info and
    ground truth solution to form the dev manifest.
    """
    with open(out_file, "w") as f_out:
        with open(file_path, "r") as f_in:
            entries = json.load(f_in)

        for i, entry in enumerate(entries):
            new_entry = {
                "question": entry["question"],
                "gt_sql": entry["SQL"],
                "sql_context": tables_info[entry["db_id"]],
                "difficulty": entry["difficulty"],
                "db_id": entry["db_id"],
                "id": i,
            }

            f_out.write(json.dumps(new_entry))
            f_out.write("\n")


def main():
    data_dir = str(Path(__file__).absolute().parent)

    dev_dir = download_data(data_dir)
    # If already downloaded: dev_dir = Path(data_dir, "dev_20240627/")
    print(f"\nData downloaded to: {dev_dir}")

    print("Starting processing...")

    # First read tables data
    tables_info = read_tables_file(dev_dir)
    print("Finished reading tables.")

    # Naming the input and output files nearly the same thing is likely
    # confusing, but <split>.jsonl is the expected format, so we just
    # keep the result in the upper-level directory, outside of dev_dir.
    format_entries(Path(dev_dir, "dev.json"), tables_info, Path(data_dir, "dev.jsonl"))
    print("Finished formatting entries. All done!")


if __name__ == "__main__":
    main()
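
The INSERT-truncation step above is worth illustrating: the first regex keeps at most 10 consecutive `INSERT` lines and elides the rest, so large table dumps don't blow up the prompt context. A small self-contained check (demo dump, not a real BIRD database):

```python
import re

# A fake dump with a schema line followed by 15 consecutive INSERT statements
dump = "CREATE TABLE t(a INT);\n" + "".join(f"INSERT INTO t VALUES({i});\n" for i in range(15))

# Same pattern as prepare.py: keep 10 INSERT lines, replace the tail with "..."
insert_chain = r"((INSERT.*$\n){10})((INSERT.*\n)*)"
truncated = re.sub(insert_chain, r"\1\n...\n", dump, flags=re.MULTILINE)

print(truncated.count("INSERT"))  # 10
print("..." in truncated)  # True
```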
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/evaluator/__init__.py
@@ -18,6 +18,7 @@
from nemo_skills.evaluation.evaluator.audio import AudioEvaluator
from nemo_skills.evaluation.evaluator.base import BaseEvaluator
from nemo_skills.evaluation.evaluator.bfcl import eval_bfcl
from nemo_skills.evaluation.evaluator.bird import BirdEvaluator
from nemo_skills.evaluation.evaluator.code import (
CodeExecEvaluator,
eval_bigcodebench,
@@ -67,6 +68,7 @@
"ioi": IOIEvaluator,
"icpc": ICPCEvaluator,
"audio": AudioEvaluator,
"bird": BirdEvaluator,
}

# Validation: Ensure no overlap between class and function maps
149 changes: 149 additions & 0 deletions nemo_skills/evaluation/evaluator/bird.py
@@ -0,0 +1,149 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re
import sqlite3
from pathlib import Path

from func_timeout import FunctionTimedOut, func_timeout

from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig
from nemo_skills.utils import nested_dataclass

# The following code was modified from:
# https://github.com/AlibabaResearch/DAMO-ConvAI/blob/main/bird/llm/src/evaluation.py

# Original license as follows:

# MIT License
#
# Copyright (c) 2022 Alibaba Research
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


def execute_sql(predicted_sql, ground_truth, db_path):
    # Connect to the database
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(predicted_sql)
        predicted_res = cursor.fetchall()
        cursor.execute(ground_truth)
        ground_truth_res = cursor.fetchall()
        res = 0
        if set(predicted_res) == set(ground_truth_res):
            res = 1
    return res


# ===== End of copied and modified code. =====


@nested_dataclass(kw_only=True)
class BirdEvaluatorConfig(BaseEvaluatorConfig):
    timeout: int = 30

    # Answer format can be "BOXED", "CODEBLOCK", or "USE_REGEX", the last of
    # which uses the given regex in the extraction_regex arg.
    answer_format: str = "CODEBLOCK"
    extraction_regex: str | None = None
    regex_dotall: bool = False


class BirdEvaluator(BaseEvaluator):
    def __init__(self, config: dict, num_parallel_requests=10):
        super().__init__(config, num_parallel_requests)
        self.eval_config = BirdEvaluatorConfig(**self.config)

        self.db_path = Path(self.eval_config.data_dir, "birdbench", "dev_20240627", "dev_databases")

    def _extract_answer(self, text):
        """Uses the specified format/regex to get the answer from the output text."""
        regex = ""
        dotall = False
        answer_format = self.eval_config.answer_format

        if answer_format == "CODEBLOCK":
            regex = r"(?:```sql)(.*?[a-zA-Z].*?)(?:```)"
            dotall = True
        elif answer_format == "BOXED":
            regex = r"(?:boxed\{\{)(.*?[a-zA-Z].*?)(?:\}\})"
            dotall = True
        elif answer_format == "USE_REGEX":
            regex = self.eval_config.extraction_regex
            dotall = self.eval_config.regex_dotall

        if not regex:
            logging.error(
                "Answer format underspecified for BIRD evaluation; should be one of "
                + "{CODEBLOCK, BOXED, USE_REGEX (provide extraction_regex)}.\n"
                + f"Got {answer_format} instead."
            )

        # Use regex to extract answer from text
        if dotall:
            code_matches = re.findall(regex, text, flags=re.DOTALL)
        else:
            code_matches = re.findall(regex, text)

        if not code_matches:
            return "SELECT 1"  # No-op filler

        # Remove comments first
        ans = re.sub(r"--.*?$|/\*.*?\*/", "", code_matches[-1], flags=re.DOTALL)  # Use last match
        # Collapse whitespace
        ans = re.sub(r"\s+", " ", ans)
        # Remove miscellaneous headers that snuck in
        ans = re.sub(r"^\*\*.*\*\*", "", ans).strip()

        return ans

    async def eval_single(self, data_point: dict):
        i = data_point["id"]
        db_id = data_point["db_id"]

        # Retrieve pred and gt
        predicted_sql = self._extract_answer(data_point["generation"])
        ground_truth = data_point["gt_sql"]
        db_place = str(Path(self.db_path, db_id, db_id + ".sqlite"))

        try:
            # Wait for result with timeout as set
            res = func_timeout(self.eval_config.timeout, execute_sql, args=(predicted_sql, ground_truth, db_place))
        except FunctionTimedOut:
            logging.info(f"SQL execution timed out for entry {i}")
            res = 0
        except Exception as e:
            logging.info(f"SQL execution failed for entry {i}:\n{e}")
            res = 0

        data_point["res"] = res
        return data_point
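
To make the extraction step concrete, here is the default `CODEBLOCK` path in miniature: the last ` ```sql ` fenced block wins, line comments are stripped, and whitespace is collapsed. The model output is a toy example, and the backticks are assembled programmatically to keep the snippet paste-safe:

```python
import re

fence = "`" * 3  # literal ``` assembled so this snippet can live inside markdown
text = (
    f"Let me reason first.\n{fence}sql\nSELECT 1\n{fence}\n"
    f"Final answer:\n{fence}sql\nSELECT name\nFROM users; -- final query\n{fence}\n"
)

# Same regexes as BirdEvaluator._extract_answer with answer_format=CODEBLOCK
matches = re.findall(rf"(?:{fence}sql)(.*?[a-zA-Z].*?)(?:{fence})", text, flags=re.DOTALL)
ans = re.sub(r"--.*?$|/\*.*?\*/", "", matches[-1], flags=re.DOTALL)  # use last match
ans = re.sub(r"\s+", " ", ans).strip()
print(ans)  # SELECT name FROM users;
```

One subtlety: because the comment pass uses `re.DOTALL` without `re.MULTILINE`, a `--` comment in the middle of a multi-line query can swallow the lines after it, so the demo places the comment on the final line.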