80 changes: 72 additions & 8 deletions docs/evaluation/speech-audio.md

This section details how to evaluate speech and audio benchmarks, including understanding tasks that test models' ability to reason about audio content (speech, music, environmental sounds) and ASR tasks for transcription.

!!! note
Currently supports only Megatron server type (`--server_type=megatron`).
!!! warning "Running without audio files"
    If you want to evaluate without audio files (not recommended), use the
    `--no-audio` flag. In this case you can also set `--skip_data_dir_check`,
    as the data is very lightweight when audio files aren't used.

## Supported benchmarks

MMAU-Pro (Multimodal Audio Understanding - Pro) is a comprehensive benchmark for multimodal audio understanding.

These benchmarks require audio files for meaningful evaluation. **Audio files are downloaded by default** to ensure proper evaluation.

### Data Preparation

To prepare the dataset with audio files:

```bash
ns prepare_data asr-leaderboard --data_dir=/path/to/data --cluster=<cluster>
```

To prepare only specific datasets:

```bash
ns prepare_data asr-leaderboard --datasets librispeech_clean ami
```
### MMAU-Pro

```bash
ns prepare_data mmau-pro --data_dir=/path/to/data --cluster=<cluster_name>
```

To prepare without saving audio files (not recommended):

```bash
ns prepare_data mmau-pro --no-audio --skip_data_dir_check
```

## Running Evaluation
```
evaluation_mode | avg_tokens | gen_seconds | success_rate | no_answer | num_entries
pass@1 | 0 | 6580 | 55.52% | 0.00% | 290
```

```
evaluation_mode | avg_tokens | gen_seconds | success_rate | no_answer | num_entries
pass@1 | 11 | 6879 | 31.44% | 0.00% | 5305
```
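
The summary tables above are plain pipe-separated text, so they are easy to post-process. A minimal parser sketch (a hypothetical helper, not part of NeMo-Skills):

```python
def parse_metrics_table(text: str) -> list[dict[str, str]]:
    """Parse a pipe-separated metrics summary into a list of row dicts."""
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    header = [col.strip() for col in lines[0].split("|")]
    rows = []
    for line in lines[1:]:
        values = [v.strip() for v in line.split("|")]
        rows.append(dict(zip(header, values)))
    return rows


table = """\
evaluation_mode | avg_tokens | gen_seconds | success_rate | no_answer | num_entries
pass@1 | 11 | 6879 | 31.44% | 0.00% | 5305
"""
rows = parse_metrics_table(table)
print(rows[0]["success_rate"])  # -> 31.44%
```

Values are kept as strings (including the `%` sign); strip and convert them as needed downstream.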

## AudioBench

AudioBench is a comprehensive benchmark for evaluating speech and audio language models across multiple tasks including ASR, translation, speech QA, and audio understanding.

### Dataset Location

- Benchmark is defined in [`nemo_skills/dataset/audiobench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/audiobench/__init__.py)
- External source repository is [AudioBench](https://github.com/AudioLLMs/AudioBench)

### Data Preparation

AudioBench can be prepared via the NeMo-Skills data preparation entrypoint. By default, it downloads or copies audio files into the prepared dataset directory.

```bash
ns prepare_data audiobench --data_dir=/path/to/data --cluster=<cluster_name>
```

To prepare without saving audio files (not recommended):

```bash
ns prepare_data audiobench --no-audio --skip_data_dir_check
```

## LibriSpeech-PC

LibriSpeech-PC is an Automatic Speech Recognition (ASR) benchmark that evaluates models' ability to transcribe speech with proper punctuation and capitalization. It builds upon the original LibriSpeech corpus with enhanced reference transcripts.

### Dataset Location

- Benchmark is defined in [`nemo_skills/dataset/librispeech-pc/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/librispeech-pc/__init__.py)
- Manifests (with punctuation/capitalization) from [OpenSLR-145](https://www.openslr.org/145/)
- Audio files from original [LibriSpeech OpenSLR-12](https://www.openslr.org/12/)

### Available Splits

- `test-clean`: Clean speech recordings (easier subset)
- `test-other`: More challenging recordings with varied acoustic conditions

### Data Preparation

LibriSpeech-PC requires audio files for ASR evaluation. **Audio files are downloaded by default**.

To prepare the dataset with audio files:

```bash
ns prepare_data librispeech-pc --data_dir=/path/to/data --cluster=<cluster_name>
```

### Preparing Specific Splits

To prepare only one split:

```bash
ns prepare_data librispeech-pc --split test-clean --data_dir=/path/to/data
```

or

```bash
ns prepare_data librispeech-pc --split test-other --data_dir=/path/to/data
```

36 changes: 36 additions & 0 deletions nemo_skills/dataset/audiobench/__init__.py
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AudioBench: A comprehensive benchmark for speech and audio language models.

AudioBench evaluates models across multiple tasks:
- ASR (Automatic Speech Recognition)
- Translation (speech-to-text translation)
- Speech QA (question answering based on audio)
- Audio understanding (emotion, gender, accent recognition, etc.)

The benchmark is organized into two main categories:
- nonjudge: Tasks evaluated with automatic metrics (WER, BLEU)
- judge: Tasks requiring LLM-as-a-judge evaluation
"""

DATASET_GROUP = "speechlm"
IS_BENCHMARK_GROUP = True
SCORE_MODULE = "nemo_skills.evaluation.metrics.audio_metrics"

# Top-level benchmarks: evaluate all judge or all nonjudge datasets
BENCHMARKS = {
"audiobench.nonjudge": {},
"audiobench.judge": {},
}
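
The dotted names in `BENCHMARKS` correspond to the `judge/` and `nonjudge/` subdirectories. A sketch of how a harness might split such names into group and subset (illustrative only, not the actual NeMo-Skills resolution logic):

```python
# Illustrative sketch: map dotted benchmark names onto their subset
# directories. This is NOT the actual NeMo-Skills resolution code.
BENCHMARKS = {
    "audiobench.nonjudge": {},
    "audiobench.judge": {},
}


def split_benchmark_name(name: str) -> tuple[str, str]:
    """Split 'group.subset' into its group and subset parts."""
    group, _, subset = name.partition(".")
    return group, subset


subsets = [split_benchmark_name(name)[1] for name in BENCHMARKS]
print(subsets)  # -> ['nonjudge', 'judge']
```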
40 changes: 40 additions & 0 deletions nemo_skills/dataset/audiobench/judge/__init__.py
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AudioBench judge tasks dataset configuration.

This dataset includes tasks that require LLM-based evaluation such as:
- Audio captioning
- Spoken question answering
- Audio understanding and reasoning

These tasks require an LLM judge for evaluation, matching the MMAU-Pro evaluation setup.
"""

# Dataset configuration - CRITICAL: needed for audio to work
DATASET_GROUP = "speechlm"
METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
GENERATION_ARGS = "++prompt_format=openai "
EVAL_ARGS = "++eval_type=audio "

# Judge configuration matching AudioBench official implementation
# Using Llama-3.1-70B with vllm (can be overridden in run scripts)
JUDGE_PIPELINE_ARGS = {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"server_type": "vllm",
"server_gpus": 8,
"server_args": "--max-model-len 8192 --gpu-memory-utilization 0.95",
}
JUDGE_ARGS = "++prompt_config=judge/audiobench ++generation_key=judgement"
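
The comment above notes that the judge settings "can be overridden in run scripts". One plausible override pattern is a plain dict merge, where run-specific values win over the defaults (the override keys below are assumptions for illustration):

```python
# Defaults from the dataset config (copied from above).
JUDGE_PIPELINE_ARGS = {
    "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "server_type": "vllm",
    "server_gpus": 8,
    "server_args": "--max-model-len 8192 --gpu-memory-utilization 0.95",
}

# Hypothetical per-run overrides; in a dict merge, the right-hand
# operand wins for duplicate keys.
overrides = {"server_gpus": 4, "server_args": "--max-model-len 4096"}
effective = {**JUDGE_PIPELINE_ARGS, **overrides}
print(effective["server_gpus"])  # -> 4
```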
31 changes: 31 additions & 0 deletions nemo_skills/dataset/audiobench/nonjudge/__init__.py
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AudioBench non-judge tasks dataset configuration.

This dataset includes ASR, translation, and other tasks that use
automatic metrics (WER, BLEU, WER-PC) instead of judge evaluation.

No judge required: metrics are computed automatically from model outputs.
"""

# Dataset configuration - CRITICAL: needed for audio to work
DATASET_GROUP = "speechlm"
METRICS_TYPE = "audio"

# Evaluation settings
EVAL_ARGS = "++eval_type=audio "

# Generation settings - OpenAI format for audio-language models
GENERATION_ARGS = "++prompt_format=openai "
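
The docstring above names WER as one of the automatic metrics for nonjudge tasks. A minimal word-error-rate sketch using the standard Levenshtein distance over words (not the exact NeMo-Skills implementation):

```python
def word_error_rate(reference: str, hypothesis: str) -> float:
    """WER = word-level edit distance / number of reference words."""
    ref = reference.split()
    hyp = hypothesis.split()
    # Dynamic-programming edit distance between the two word sequences.
    dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dist[i][0] = i  # delete all reference words
    for j in range(len(hyp) + 1):
        dist[0][j] = j  # insert all hypothesis words
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = dist[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dist[i][j] = min(sub, dist[i - 1][j] + 1, dist[i][j - 1] + 1)
    return dist[len(ref)][len(hyp)] / max(len(ref), 1)


# One deletion out of six reference words -> WER of 1/6.
print(word_error_rate("the cat sat on the mat", "the cat sat on mat"))
```

WER-PC (also mentioned above) follows the same idea but scores text with punctuation and capitalization preserved rather than normalized away.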