diff --git a/docs/evaluation/speech-audio.md b/docs/evaluation/speech-audio.md index 9a5f7c5251..2e170891b3 100644 --- a/docs/evaluation/speech-audio.md +++ b/docs/evaluation/speech-audio.md @@ -2,8 +2,10 @@ This section details how to evaluate speech and audio benchmarks, including understanding tasks that test models' ability to reason about audio content (speech, music, environmental sounds) and ASR tasks for transcription. -!!! note - Currently supports only Megatron server type (`--server_type=megatron`). +!!! warning "Running without audio files" + If you want to evaluate without audio files (not recommended) use + `--no-audio` flag. In this case you can also set `--skip_data_dir_check` + as data is very lightweight when audio files aren't being used. ## Supported benchmarks @@ -35,12 +37,9 @@ MMAU-Pro (Multimodal Audio Understanding - Pro) is a comprehensive benchmark for These benchmarks require audio files for meaningful evaluation. **Audio files are downloaded by default** to ensure proper evaluation. -!!! warning "Running without audio files" - If you want to evaluate without audio files (not recommended) use - `--no-audio` flag. In this case you can also set `--skip_data_dir_check` - as data is very lightweight when audio files aren't being used. 
+### Data Preparation -### ASR Leaderboard +To prepare the dataset with audio files: ```bash ns prepare_data asr-leaderboard --data_dir=/path/to/data --cluster= @@ -55,7 +54,7 @@ ns prepare_data asr-leaderboard --datasets librispeech_clean ami ### MMAU-Pro ```bash -ns prepare_data mmau-pro --data_dir=/path/to/data --cluster= +ns prepare_data mmau-pro --no-audio --skip_data_dir_check ``` ## Running Evaluation @@ -344,3 +343,68 @@ pass@1 | 0 | 6580 | 55.52% | 0.00% | 290 evaluation_mode | avg_tokens | gen_seconds | success_rate | no_answer | num_entries pass@1 | 11 | 6879 | 31.44% | 0.00% | 5305 ``` + +## AudioBench + +AudioBench is a comprehensive benchmark for evaluating speech and audio language models across multiple tasks including ASR, translation, speech QA, and audio understanding. + +### Dataset Location + +- Benchmark is defined in [`nemo_skills/dataset/audiobench/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/audiobench/__init__.py) +- External source repository is [AudioBench](https://github.com/AudioLLMs/AudioBench) + +### Data Preparation + +AudioBench can be prepared via the NeMo-Skills data preparation entrypoint. By default it will download/copy audio files into the prepared dataset directory. + +```bash +ns prepare_data audiobench --data_dir=/path/to/data --cluster= +``` + +To prepare without saving audio files (not recommended): + +```bash +ns prepare_data audiobench --no-audio --skip_data_dir_check +``` + +## LibriSpeech-PC + +LibriSpeech-PC is an Automatic Speech Recognition (ASR) benchmark that evaluates models' ability to transcribe speech with proper punctuation and capitalization. It builds upon the original LibriSpeech corpus with enhanced reference transcripts. 
+ +### Dataset Location + +- Benchmark is defined in [`nemo_skills/dataset/librispeech-pc/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/librispeech-pc/__init__.py) +- Manifests (with punctuation/capitalization) from [OpenSLR-145](https://www.openslr.org/145/) +- Audio files from original [LibriSpeech OpenSLR-12](https://www.openslr.org/12/) + +### Available Splits + +- `test-clean`: Clean speech recordings (easier subset) +- `test-other`: More challenging recordings with varied acoustic conditions + +## Preparing LibriSpeech-PC Data + +LibriSpeech-PC requires audio files for ASR evaluation. **Audio files are downloaded by default**. + +### Data Preparation + +To prepare the dataset with audio files: + +```bash +ns prepare_data librispeech-pc --data_dir=/path/to/data --cluster= +``` + +### Preparing Specific Splits + +To prepare only one split: + +```bash +ns prepare_data librispeech-pc --split test-clean --data_dir=/path/to/data +``` + +or + +```bash +ns prepare_data librispeech-pc --split test-other --data_dir=/path/to/data +``` + diff --git a/nemo_skills/dataset/audiobench/__init__.py b/nemo_skills/dataset/audiobench/__init__.py new file mode 100644 index 0000000000..152d2ac721 --- /dev/null +++ b/nemo_skills/dataset/audiobench/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""AudioBench: A comprehensive benchmark for speech and audio language models. + +AudioBench evaluates models across multiple tasks: +- ASR (Automatic Speech Recognition) +- Translation (speech-to-text translation) +- Speech QA (question answering based on audio) +- Audio understanding (emotion, gender, accent recognition, etc.) + +The benchmark is organized into two main categories: +- nonjudge: Tasks evaluated with automatic metrics (WER, BLEU) +- judge: Tasks requiring LLM-as-a-judge evaluation +""" + +DATASET_GROUP = "speechlm" +IS_BENCHMARK_GROUP = True +SCORE_MODULE = "nemo_skills.evaluation.metrics.audio_metrics" + +# Top-level benchmarks: evaluate all judge or all nonjudge datasets +BENCHMARKS = { + "audiobench.nonjudge": {}, + "audiobench.judge": {}, +} diff --git a/nemo_skills/dataset/audiobench/judge/__init__.py b/nemo_skills/dataset/audiobench/judge/__init__.py new file mode 100644 index 0000000000..62e48d4ec6 --- /dev/null +++ b/nemo_skills/dataset/audiobench/judge/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AudioBench judge tasks dataset configuration. + +This dataset includes tasks that require LLM-based evaluation such as: +- Audio captioning +- Spoken question answering +- Audio understanding and reasoning + +These tasks require an LLM judge for evaluation, matching MMAU-Pro evaluation setup. 
+""" + +# Dataset configuration - CRITICAL: needed for audio to work +DATASET_GROUP = "speechlm" +METRICS_TYPE = "audio" +DEFAULT_SPLIT = "test" +GENERATION_ARGS = "++prompt_format=openai " +EVAL_ARGS = "++eval_type=audio " + +# Judge configuration matching AudioBench official implementation +# Using Llama-3.1-70B with vllm (can be overridden in run scripts) +JUDGE_PIPELINE_ARGS = { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "server_type": "vllm", + "server_gpus": 8, + "server_args": "--max-model-len 8192 --gpu-memory-utilization 0.95", +} +JUDGE_ARGS = "++prompt_config=judge/audiobench ++generation_key=judgement" diff --git a/nemo_skills/dataset/audiobench/nonjudge/__init__.py b/nemo_skills/dataset/audiobench/nonjudge/__init__.py new file mode 100644 index 0000000000..d26668ce8f --- /dev/null +++ b/nemo_skills/dataset/audiobench/nonjudge/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AudioBench non-judge tasks dataset configuration. + +This dataset includes ASR, translation, and other tasks that use +automatic metrics (WER, BLEU, WER-PC) instead of judge evaluation. + +NO JUDGE REQUIRED - Metrics computed automatically from model outputs. 
+""" + +# Dataset configuration - CRITICAL: needed for audio to work +DATASET_GROUP = "speechlm" +METRICS_TYPE = "audio" + +# Evaluation settings +EVAL_ARGS = "++eval_type=audio " + +# Generation settings - OpenAI format for audio-language models +GENERATION_ARGS = "++prompt_format=openai " diff --git a/nemo_skills/dataset/audiobench/prepare.py b/nemo_skills/dataset/audiobench/prepare.py new file mode 100644 index 0000000000..40fb75acd7 --- /dev/null +++ b/nemo_skills/dataset/audiobench/prepare.py @@ -0,0 +1,606 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AudioBench Dataset Preparation for nemo-skills + +This script prepares AudioBench datasets for evaluation with nemo-skills. +AudioBench is a comprehensive benchmark for evaluating speech and audio models +across multiple tasks including ASR, translation, speech QA, and more. 
+ +Usage: + python -m nemo_skills.dataset.audiobench.prepare --split test + python -m nemo_skills.dataset.audiobench.prepare --datasets librispeech_test_clean earnings21_test + python -m nemo_skills.dataset.audiobench.prepare --category nonjudge +""" + +import argparse +import json +import os +import shutil +from pathlib import Path +from typing import Dict, List + +import numpy as np +import soundfile as sf +from tqdm import tqdm + +# AudioBench datasets categorized by evaluation type +JUDGE_DATASETS = [ + "alpaca_audio_test", + "audiocaps_qa_test", + "audiocaps_test", + "clotho_aqa_test", + "cn_college_listen_mcq_test", + "dream_tts_mcq_test", + "iemocap_emotion_test", + "iemocap_gender_test", + "imda_ar_dialogue", + "imda_ar_sentence", + "imda_gr_dialogue", + "imda_gr_sentence", + "imda_part3_30s_ds_human_test", + "imda_part4_30s_ds_human_test", + "imda_part5_30s_ds_human_test", + "imda_part6_30s_ds_human_test", + "imda_part3_30s_sqa_human_test", + "imda_part4_30s_sqa_human_test", + "imda_part5_30s_sqa_human_test", + "imda_part6_30s_sqa_human_test", + "meld_emotion_test", + "meld_sentiment_test", + "mmau_mini", + "muchomusic_test", + "openhermes_audio_test", + "public_sg_speech_qa_test", + "slue_p2_sqa5_test", + "spoken_squad_test", + "voxceleb_accent_test", + "voxceleb_gender_test", + "wavcaps_qa_test", + "wavcaps_test", +] + +NONJUDGE_DATASETS = [ + "aishell_asr_zh_test", + "common_voice_15_en_test", + "covost2_en_id_test", + "covost2_en_ta_test", + "covost2_en_zh_test", + "covost2_id_en_test", + "covost2_ta_en_test", + "covost2_zh_en_test", + "earnings21_test", + "earnings22_test", + "gigaspeech_test", + "gigaspeech2_indo", + "gigaspeech2_thai", + "gigaspeech2_viet", + "imda_part1_asr_test", + "imda_part2_asr_test", + "imda_part3_30s_asr_test", + "imda_part4_30s_asr_test", + "imda_part5_30s_asr_test", + "imda_part6_30s_asr_test", + "librispeech_test_clean", + "librispeech_test_other", + "peoples_speech_test", + "seame_dev_man", + "seame_dev_sge", + 
"spoken-mqa_long_digit", + "spoken-mqa_multi_step_reasoning", + "spoken-mqa_short_digit", + "spoken-mqa_single_step_reasoning", + "tedlium3_test", + "tedlium3_long_form_test", +] + + +def get_audio_duration(audio_array: np.ndarray, sampling_rate: int) -> float: + """Compute audio duration in seconds from array and sampling rate.""" + if audio_array is None or len(audio_array) == 0: + return 0.0 + return float(len(audio_array) / sampling_rate) + + +def save_audio_file(audio_array: np.ndarray, sampling_rate: int, output_path: str): + """Save audio array to WAV file.""" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + sf.write(output_path, audio_array, sampling_rate) + + +def extract_audio_dict(sample: Dict) -> Dict | None: + """Extract an Audio feature dict from a HuggingFace sample. + + AudioLLMs-hosted AudioBench datasets commonly store audio under the `context` + column (HF Audio feature), while other sources may use `audio`. + """ + # Prefer official HF Audio feature columns if present + for key in ("context", "audio"): + audio_dict = sample.get(key) + if isinstance(audio_dict, dict): + return audio_dict + return None + + +def create_manifest_entry( + sample: Dict, + audio_filename: str, + duration: float, + dataset_name: str, + sample_id: int, + category: str, +) -> Dict: + """Create a nemo-skills compatible manifest entry. 
+ + Args: + sample: Raw sample from AudioBench dataset + audio_filename: Audio filename (relative path within audiobench directory) + duration: Audio duration in seconds + dataset_name: Name of the dataset + sample_id: Sample index + category: Category (judge/nonjudge) + + Returns: + Manifest entry dict with proper format for nemo-skills + """ + instruction = sample.get("instruction", sample.get("text", "Process the audio")) + reference = sample.get("reference", sample.get("answer", "")) + task_type = sample.get("task_type", "unknown") + + # Create absolute audio path with /data/ prefix for cluster deployment + # Format: /data/audiobench/{category}/audio/{dataset_name}/{audio_filename} + audio_rel_path = f"/data/audiobench/{category}/audio/{dataset_name}/{audio_filename}" + + # Create audio metadata (both singular and plural forms for compatibility) + audio_metadata = {"path": audio_rel_path, "duration": duration} + + entry = { + "expected_answer": reference, + "audio_path": [audio_rel_path], + # Used by audio metrics to decide whether to parse LLM-as-a-judge results. + # AudioBench "judge" datasets are open-ended (judged), while "nonjudge" datasets are closed-form. + "category": "open" if category == "judge" else ("closed" if category == "nonjudge" else category), + "messages": [ + {"role": "system", "content": "You are a helpful assistant. 
/no_think"}, + { + "role": "user", + "content": instruction, + "audio": audio_metadata, + "audios": [audio_metadata], + }, + ], + "dataset": dataset_name, + "subset_for_metrics": dataset_name, + "sample_id": sample_id, + "task_type": task_type, + "question": instruction, + } + + for key in [ + "choices", + "options", + "audio_text_instruction", + "audio_gt", + "dimension", + "rule_type", + "rule_target", + "task", + ]: + if key in sample: + entry[key] = sample[key] + + return entry + + +def process_dataset( + dataset_name: str, + output_dir: Path, + save_audio: bool = True, + split: str = "test", + max_samples: int = -1, +) -> tuple[int, List[Dict]]: + """Process a single AudioBench dataset. + + Args: + dataset_name: Name of the dataset to process + output_dir: Base output directory + save_audio: Whether to save audio files + split: Dataset split (default: "test") + max_samples: Max number of samples to process (-1 for all) + + Returns: + Tuple of (num_samples, manifest_entries) + """ + print(f"\n{'=' * 60}") + print(f"Processing: {dataset_name}") + print(f"{'=' * 60}") + + try: + from datasets import load_dataset + except Exception as e: + raise ImportError( + f"Failed to import HuggingFace 'datasets'. Please ensure it is installed.\nOriginal error: {e}" + ) + + # Upstream reference: https://github.com/AudioLLMs/AudioBench + try: + # AudioBench mapping for datasets that are not 1:1 AudioLLMs/. 
+ hf_map = { + # AudioLLMs org aliases + "aishell_asr_zh_test": {"repo": "AudioLLMs/aishell_1_zh_test", "split": "test"}, + "muchomusic_test": {"repo": "AudioLLMs/mu_chomusic_test", "split": "test"}, + "openhermes_audio_test": {"repo": "AudioLLMs/openhermes_instruction_test", "split": "test"}, + "iemocap_emotion_test": {"repo": "AudioLLMs/iemocap_emotion_recognition", "split": "test"}, + "iemocap_gender_test": {"repo": "AudioLLMs/iemocap_gender_recognition", "split": "test"}, + "mmau_mini": { + "repo": "AudioLLMs/MMAU-mini", + "split": "test", + "fallback_repo": "AudioLLMs/MMAU-mini-do-not-use", + }, + # GigaSpeech2 variants (one repo with data_dir selector) + "gigaspeech2_thai": {"repo": "AudioLLMs/gigaspeech2-test", "split": "train", "data_dir": "th-test"}, + "gigaspeech2_indo": {"repo": "AudioLLMs/gigaspeech2-test", "split": "train", "data_dir": "id-test"}, + "gigaspeech2_viet": {"repo": "AudioLLMs/gigaspeech2-test", "split": "train", "data_dir": "vi-test"}, + "spoken-mqa_short_digit": {"repo": "amao0o0/spoken-mqa", "split": "short_digit"}, + "spoken-mqa_long_digit": {"repo": "amao0o0/spoken-mqa", "split": "long_digit"}, + "spoken-mqa_single_step_reasoning": {"repo": "amao0o0/spoken-mqa", "split": "single_step_reasoning"}, + "spoken-mqa_multi_step_reasoning": {"repo": "amao0o0/spoken-mqa", "split": "multi_step_reasoning"}, + "imda_part1_asr_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART1-Test", + }, + "imda_part2_asr_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART2-Test", + }, + "imda_part3_30s_asr_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART3-Test", + }, + "imda_part4_30s_asr_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART4-Test", + }, + "imda_part5_30s_asr_test": { + "repo": 
"MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART5-Test", + }, + "imda_part6_30s_asr_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "ASR-PART6-Test", + }, + "imda_part3_30s_sqa_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SQA-PART3-Test", + }, + "imda_part4_30s_sqa_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SQA-PART4-Test", + }, + "imda_part5_30s_sqa_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SQA-PART5-Test", + }, + "imda_part6_30s_sqa_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SQA-PART6-Test", + }, + "imda_part3_30s_ds_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SDS-PART3-Test", + }, + "imda_part4_30s_ds_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SDS-PART4-Test", + }, + "imda_part5_30s_ds_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SDS-PART5-Test", + }, + "imda_part6_30s_ds_human_test": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "SDS-PART6-Test", + }, + "imda_ar_sentence": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "PQA-AR-Sentence-Test", + }, + "imda_ar_dialogue": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "PQA-AR-Dialogue-Test", + }, + "imda_gr_sentence": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", + "data_dir": "PQA-GR-Sentence-Test", + }, + "imda_gr_dialogue": { + "repo": "MERaLiON/Multitask-National-Speech-Corpus-v1", + "split": "train", 
+ "data_dir": "PQA-GR-Dialogue-Test", + }, + } + + spec = hf_map.get(dataset_name) + if spec is None: + hf_repo = f"AudioLLMs/{dataset_name}" + hf_split = split + hf_ds = load_dataset(hf_repo, split=hf_split) + else: + hf_repo = spec["repo"] + hf_split = spec.get("split", split) + data_dir = spec.get("data_dir") + if data_dir: + hf_ds = load_dataset(hf_repo, data_dir=data_dir, split=hf_split) + else: + hf_ds = load_dataset(hf_repo, split=hf_split) + + fallback_repo = spec.get("fallback_repo") + if fallback_repo: + # Only try fallback if the primary repo is missing/inaccessible. + # (Keep behavior deterministic and close to upstream mapping.) + try: + _ = len(hf_ds) + except Exception: + hf_repo = fallback_repo + hf_ds = load_dataset(hf_repo, split=hf_split) + + if max_samples is not None and int(max_samples) > 0: + hf_ds = hf_ds.select(range(min(int(max_samples), len(hf_ds)))) + data_samples = hf_ds + print(f"Loaded {len(hf_ds)} samples via HuggingFace datasets: {hf_repo} (split={hf_split})") + except Exception as e: + raise Exception( + "Failed to load AudioBench dataset via HuggingFace.\n" + f"- Requested dataset_name: {dataset_name}\n" + f"- HuggingFace dataset repo attempted: {locals().get('hf_repo', 'UNKNOWN')}\n" + f"- Split: {locals().get('hf_split', split)}\n" + "Please verify the dataset exists under the AudioLLMs org:\n" + " https://huggingface.co/AudioLLMs/datasets\n" + f"Original error: {e}" + ) + + # Determine category + dataset_base = dataset_name.replace("_test", "") + if dataset_name in JUDGE_DATASETS or dataset_base in JUDGE_DATASETS: + category = "judge" + elif dataset_name in NONJUDGE_DATASETS or dataset_base in NONJUDGE_DATASETS: + category = "nonjudge" + else: + category = "unknown" + + # Output directories + audio_dir = output_dir / category / "audio" / dataset_name + dataset_dir = output_dir / category / dataset_name + os.makedirs(audio_dir, exist_ok=True) + os.makedirs(dataset_dir, exist_ok=True) + + # Copy __init__.py from category folder 
to dataset folder + category_init = output_dir / category / "__init__.py" + dataset_init = dataset_dir / "__init__.py" + if category_init.exists() and not dataset_init.exists(): + shutil.copy2(category_init, dataset_init) + print(f"✓ Copied __init__.py to {dataset_dir}") + + manifest_entries = [] + successful = 0 + failed = 0 + + for idx, sample in enumerate(tqdm(data_samples, desc=f"Processing {dataset_name}")): + try: + # Get audio data + audio_dict = extract_audio_dict(sample) + if audio_dict is None: + print(f"Warning: Sample {idx} has no audio, skipping") + failed += 1 + continue + + # Extract audio array and sampling rate + audio_array = audio_dict.get("array") + sampling_rate = audio_dict.get("sampling_rate", 16000) + + if audio_array is None or len(audio_array) == 0: + print(f"Warning: Empty audio at sample {idx}, skipping") + failed += 1 + continue + + # Convert to numpy array if needed + if isinstance(audio_array, list): + audio_array = np.array(audio_array) + + # Compute duration + duration = get_audio_duration(audio_array, sampling_rate) + + # Define audio file paths + audio_filename = f"{dataset_name}_{idx:06d}.wav" + local_audio_path = audio_dir / audio_filename + + # Save audio file + if save_audio: + try: + save_audio_file(audio_array, sampling_rate, str(local_audio_path)) + except Exception as e: + print(f"Warning: Failed to save audio for sample {idx}: {e}") + failed += 1 + continue + + # Create manifest entry with relative path + entry = create_manifest_entry( + sample=sample, + audio_filename=audio_filename, + duration=duration, + dataset_name=dataset_name, + sample_id=idx, + category=category, + ) + + manifest_entries.append(entry) + successful += 1 + + except Exception as e: + print(f"Error processing sample {idx}: {e}") + failed += 1 + continue + + # Save dataset-specific manifest to dataset directory + manifest_path = dataset_dir / f"{split}.jsonl" + with open(manifest_path, "w", encoding="utf-8") as f: + for entry in manifest_entries: + 
f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + print(f"✓ Saved {successful} samples to {manifest_path}") + if failed > 0: + print(f"✗ Failed to process {failed} samples") + + return successful, manifest_entries + + +def main(): + parser = argparse.ArgumentParser(description="Prepare AudioBench datasets for nemo-skills evaluation") + parser.add_argument( + "--split", + default="test", + choices=["train", "validation", "test"], + help="Dataset split to prepare", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Output directory (defaults to $NEMO_SKILLS_DATA_DIR/audiobench)", + ) + parser.add_argument( + "--datasets", + nargs="+", + help="Specific dataset(s) to process (e.g., librispeech_test_clean earnings21)", + ) + parser.add_argument( + "--category", + choices=["judge", "nonjudge", "all"], + default="all", + help="Process only judge, nonjudge, or all datasets", + ) + parser.add_argument( + "--no-audio", + dest="save_audio", + action="store_false", + help="Skip saving audio files (only create manifests)", + ) + parser.add_argument( + "--max-samples", + type=int, + default=-1, + help="Maximum number of samples to process per dataset (-1 for all)", + ) + parser.set_defaults(save_audio=True) + + args = parser.parse_args() + + # Determine output directory + if args.output_dir: + output_dir = Path(args.output_dir) + else: + # Use dataset directory as output (files will be in nemo_skills/dataset/audiobench/) + output_dir = Path(__file__).parent + + output_dir.mkdir(parents=True, exist_ok=True) + + print("\n" + "=" * 60) + print("AudioBench Dataset Preparation") + print("=" * 60) + print("AudioBench source: HuggingFace datasets (AudioLLMs/AudioBench)") + print(f"Output directory: {output_dir}") + print(f"Save audio files: {args.save_audio}") + print(f"Split: {args.split}") + print("=" * 60 + "\n") + + # Determine which datasets to process + if args.datasets: + target_datasets = args.datasets + else: + all_datasets = 
JUDGE_DATASETS + NONJUDGE_DATASETS + if args.category == "judge": + target_datasets = JUDGE_DATASETS + elif args.category == "nonjudge": + target_datasets = NONJUDGE_DATASETS + else: # all + target_datasets = all_datasets + + # Initialize category folders with __init__.py for nemo-skills to find dataset defaults + for category in ["judge", "nonjudge"]: + category_dir = output_dir / category + category_dir.mkdir(exist_ok=True) + + # Copy category __init__.py + init_file = category_dir / "__init__.py" + template_init = output_dir / category / "__init__.py" + if not init_file.exists() and template_init.exists(): + shutil.copy2(template_init, init_file) + + total_samples = 0 + total_datasets = 0 + + for name in target_datasets: + # Normalize dataset name: allow passing without _test suffix + dataset_name = name + if dataset_name not in JUDGE_DATASETS and dataset_name not in NONJUDGE_DATASETS: + # Try adding _test suffix (AudioBench uses mixed naming) + if f"{dataset_name}_test" in JUDGE_DATASETS or f"{dataset_name}_test" in NONJUDGE_DATASETS: + dataset_name = f"{dataset_name}_test" + + # Determine category for logging + category = "judge" if name in JUDGE_DATASETS else "nonjudge" + + try: + num_samples, _ = process_dataset( + dataset_name=dataset_name, + output_dir=output_dir, + save_audio=args.save_audio, + split=args.split, + max_samples=args.max_samples, + ) + total_samples += num_samples + total_datasets += 1 + print(f"✓ Completed {dataset_name}: {num_samples} samples") + except Exception as e: + print(f"✗ Failed {dataset_name}: {e}") + continue + + print("\n" + "=" * 60) + print("AudioBench Preparation Summary") + print("=" * 60) + print(f"Datasets processed: {total_datasets}/{len(target_datasets)}") + print(f"Total samples: {total_samples}") + print(f"Output directory: {output_dir}") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/librispeech-pc/__init__.py b/nemo_skills/dataset/librispeech-pc/__init__.py new file mode 
100644 index 0000000000..28b02d9656 --- /dev/null +++ b/nemo_skills/dataset/librispeech-pc/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LibriSpeech-PC: ASR evaluation with Punctuation and Capitalization. + +Test sets (evaluation only): +- test-clean: Clean speech recordings (~2.6k samples) +- test-other: More challenging speech with various acoustic conditions (~2.9k samples) +""" + +DATASET_GROUP = "speechlm" +METRICS_TYPE = "audio" +DEFAULT_SPLIT = "test-clean" + + +EVAL_SPLIT = "test-clean" +EVAL_ARGS = "++eval_type=audio " +GENERATION_ARGS = "++prompt_format=openai " diff --git a/nemo_skills/dataset/librispeech-pc/prepare.py b/nemo_skills/dataset/librispeech-pc/prepare.py new file mode 100644 index 0000000000..a260d864c3 --- /dev/null +++ b/nemo_skills/dataset/librispeech-pc/prepare.py @@ -0,0 +1,215 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

"""Prepare LibriSpeech-PC for ASR evaluation with punctuation and capitalization.

LibriSpeech-PC provides manifests with punctuation/capitalization from OpenSLR-145.
Audio files are downloaded from original LibriSpeech at OpenSLR-12.

Usage:
    ns prepare_data librispeech-pc --data_dir
    ns prepare_data librispeech-pc --split test-clean (or test-other) --data_dir
"""

import argparse
import json
import os
import shutil
import sys
import tarfile
import urllib.request
from pathlib import Path

# LibriSpeech-PC manifests (with punctuation and capitalization)
MANIFESTS_URL = "https://www.openslr.org/resources/145/manifests.tar.gz"

# Original LibriSpeech audio files
AUDIO_URLS = {
    "test-clean": "https://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other": "https://www.openslr.org/resources/12/test-other.tar.gz",
}

# Manifest files extracted from the OpenSLR-145 archive.
_WANTED_MANIFESTS = ("test-clean.json", "test-other.json")


def download_with_progress(url: str, output_path: Path, desc: str):
    """Download ``url`` to ``output_path`` with a tqdm progress bar.

    Args:
        url: Source URL to fetch.
        output_path: Destination file path.
        desc: Progress-bar label.
    """
    # Imported lazily so the module stays importable (e.g. manifest-only
    # processing, unit tests) when tqdm is not installed.
    from tqdm import tqdm

    with tqdm(unit="B", unit_scale=True, unit_divisor=1024, desc=desc) as pbar:

        def reporthook(blocknum, blocksize, totalsize):
            # urlretrieve passes totalsize == -1 when the server sends no
            # Content-Length; leave the bar open-ended in that case.
            if totalsize > 0 and pbar.total != totalsize:
                pbar.total = totalsize
            downloaded = blocknum * blocksize
            pbar.update(max(0, downloaded - pbar.n))

        urllib.request.urlretrieve(url, output_path, reporthook)


def download_manifests(output_dir: Path) -> Path:
    """Download LibriSpeech-PC manifests if not already present.

    Extracts only the wanted ``test-*.json`` files from the archive and
    removes the tarball afterwards.

    Returns:
        ``output_dir`` (for call chaining).
    """
    if all((output_dir / name).exists() for name in _WANTED_MANIFESTS):
        return output_dir

    tar_path = output_dir / "manifests.tar.gz"
    download_with_progress(MANIFESTS_URL, tar_path, "Downloading manifests")

    wanted = set(_WANTED_MANIFESTS)
    with tarfile.open(tar_path, "r:gz") as tar:
        for member in tar.getmembers():
            name = Path(member.name).name
            if name not in wanted:
                continue
            fobj = tar.extractfile(member)
            # extractfile returns None for non-regular members (directories).
            if fobj is None:
                continue
            with open(output_dir / name, "wb") as fout:
                shutil.copyfileobj(fobj, fout)
    os.remove(tar_path)

    print("✓ Manifests ready\n")
    return output_dir


def download_audio(split: str, audio_dir: Path):
    """Download and extract LibriSpeech audio for ``split`` if not present.

    The OpenSLR-12 tarballs extract to ``LibriSpeech/<split>`` using the
    hyphenated split name (e.g. ``LibriSpeech/test-clean``).
    """
    # BUGFIX: the existence check previously used split.replace("-", "_"),
    # which never matches the extracted directory name, so the (multi-GB)
    # audio archive was re-downloaded on every run.
    split_dir = audio_dir / "LibriSpeech" / split
    if split_dir.exists():
        return

    tar_path = audio_dir / f"{split}.tar.gz"
    download_with_progress(AUDIO_URLS[split], tar_path, f"Downloading {split}")

    with tarfile.open(tar_path, "r:gz") as tar:
        # The safe 'data' extraction filter is only available from 3.11.4.
        if sys.version_info >= (3, 11, 4):
            tar.extractall(audio_dir, filter="data")
        else:
            tar.extractall(audio_dir)
    os.remove(tar_path)


def process_split(split: str, data_dir: Path, audio_dir: Path, with_audio: bool) -> int:
    """Process one LibriSpeech-PC split into nemo-skills format.

    Reads ``data_dir/<split>.json`` (one JSON object per line), writes
    ``data_dir/<split>.jsonl`` and deletes the raw manifest on success.

    Args:
        split: Split name ("test-clean" or "test-other").
        data_dir: Directory containing the manifest; output is written here.
        audio_dir: Directory where LibriSpeech audio is (to be) extracted.
        with_audio: When True, download the audio archive for this split.

    Returns:
        Number of samples written (0 if the manifest is missing).
    """
    output_file = data_dir / f"{split}.jsonl"
    manifest_file = data_dir / f"{split}.json"
    if not manifest_file.exists():
        print(f"✗ Manifest not found: {manifest_file}")
        return 0

    if with_audio:
        download_audio(split, audio_dir)

    with open(manifest_file, "r") as f:
        entries = [json.loads(line) for line in f if line.strip()]

    # Root under which audio is expected inside the evaluation container.
    # Loop-invariant, so resolve it once instead of per entry.
    audio_root = os.getenv("NEMO_SKILLS_AUDIO_ROOT", "/data")

    processed = 0
    skipped = 0

    with open(output_file, "w") as fout:
        for entry in entries:
            audio_filepath = entry.get("audio_filepath", "")
            text = entry.get("text", "")

            # Skip entries that cannot be evaluated (no audio path or no reference).
            if not audio_filepath or not text:
                skipped += 1
                continue

            audio_id = Path(audio_filepath).stem

            # Manifest paths may or may not carry a leading "LibriSpeech/"
            # component; normalize to a single container-side layout.
            rel_audio_path = audio_filepath.lstrip("/")
            if rel_audio_path.startswith("LibriSpeech/"):
                rel_audio_path = rel_audio_path[len("LibriSpeech/") :]
            container_path = f"{audio_root}/librispeech-pc/LibriSpeech/{rel_audio_path}"

            user_message = {
                "role": "user",
                "content": "Transcribe the audio with proper punctuation and capitalization.",
                "audio": {"path": container_path},
            }

            output_entry = {
                "audio_filepath": container_path,
                "text": text,
                "expected_answer": text,
                "task_type": "ASR-PC",
                "sample_id": audio_id,
                "split": split,
                "messages": [{"role": "system", "content": "You are a helpful assistant. /no_think"}, user_message],
            }

            fout.write(json.dumps(output_entry, ensure_ascii=False) + "\n")
            processed += 1

    print(f"✓ {split}: {processed} samples" + (f" ({skipped} skipped)" if skipped > 0 else ""))

    # The raw manifest is no longer needed once converted; keep it when
    # nothing was processed to ease debugging.
    if processed > 0 and manifest_file.exists():
        os.remove(manifest_file)

    return processed


def main():
    """CLI entry point: resolve output dir, fetch manifests, process splits."""
    parser = argparse.ArgumentParser(description="Prepare LibriSpeech-PC for ASR evaluation")
    parser.add_argument(
        "--data_dir",
        type=str,
        default=os.getenv("NEMO_SKILLS_DATA_DIR"),
        help=(
            "Base data dir (defaults to $NEMO_SKILLS_DATA_DIR). "
            "If provided, output goes under /librispeech-pc. "
            "If omitted, writes into this package's dataset directory (only allowed outside site-packages)."
        ),
    )
    parser.add_argument(
        "--split",
        default="all",
        choices=["all", "test-clean", "test-other"],
        help="Which split to prepare (default: all)",
    )
    parser.add_argument(
        "--no-audio",
        action="store_true",
        help="Skip audio download",
    )
    args = parser.parse_args()

    if args.data_dir:
        data_dir = Path(args.data_dir) / "librispeech-pc"
    else:
        # Fall back to the package directory only for source checkouts;
        # refuse to write into an installed package.
        pkg_dir = Path(__file__).parent
        pkg_dir_str = str(pkg_dir)
        if "site-packages" in pkg_dir_str or "dist-packages" in pkg_dir_str:
            raise SystemExit(
                "Missing --data_dir and NEMO_SKILLS_DATA_DIR is not set. "
                "Refusing to write into the installed package directory; please set NEMO_SKILLS_DATA_DIR "
                "or pass --data_dir."
            )
        data_dir = pkg_dir

    audio_dir = data_dir
    audio_dir.mkdir(parents=True, exist_ok=True)

    download_manifests(data_dir)

    splits = ["test-clean", "test-other"] if args.split == "all" else [args.split]
    total = sum(process_split(split, data_dir, audio_dir, not args.no_audio) for split in splits)

    print(f"\n✓ Complete: {total} samples")


if __name__ == "__main__":
    main()
b/nemo_skills/evaluation/metrics/audio_metrics.py @@ -74,25 +74,43 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): self.cap_accuracy_scores = [] self.char_rate_scores = [] - def _extract_judge_result(self, judgement_text: str) -> bool: - """Extract judge result from judgement text. + # Judge scores (AudioBench-style rating 0-5, or legacy binary Yes/No mapped to 1/0) + self.judge_ratings = [] - Parses LLM judge output to determine if the response is correct. + def _extract_judge_result(self, judgement_text: str) -> tuple[bool, float]: + """Extract judge result from judgement text. - Args: - judgement_text: Text output from LLM judge + Supports two formats: + 1. AudioBench format: 'Rating: X' where X is 0-5 (returns rating as float) + 2. Legacy/binary format: 'Judgement: Yes/No' (mapped to 5.0/0.0 for consistent 0-100 scaling) Returns: - True if judge indicates correct, False otherwise + Tuple of (is_correct, rating_score) + - is_correct: True if rating >= 3 (or Yes for legacy) + - rating_score: 0-5 rating (or 0/5 for legacy binary) """ import re + # Try AudioBench format first: 'Rating: X' + rating_match = re.search(r"Rating:\s*([0-9]+(?:\.[0-9]+)?)", judgement_text, re.IGNORECASE) + if rating_match: + rating = float(rating_match.group(1)) + rating = max(0.0, min(5.0, rating)) + return rating >= 3.0, rating + + # Try explicit Judgement: Yes/No format + judgement_match = re.search(r"Judgement:\s*(Yes|No)", judgement_text, re.IGNORECASE) + if judgement_match: + is_yes = judgement_match.group(1).lower() == "yes" + return is_yes, 5.0 if is_yes else 0.0 + + # Last-resort: accept plain 'yes'/'no' anywhere in text if re.search(r"\byes\b", judgement_text, re.IGNORECASE): - return True - elif re.search(r"\bno\b", judgement_text, re.IGNORECASE): - return False - else: - return False + return True, 5.0 + if re.search(r"\bno\b", judgement_text, re.IGNORECASE): + return False, 0.0 + + return False, 0.0 def _get_score_dict(self, prediction: dict) -> 
dict[str, bool | int | float]: """Extract correctness scores from prediction. @@ -111,8 +129,9 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: category = prediction.get("category", "unknown") if "judgement" in prediction and category == "open": - judge_result = self._extract_judge_result(prediction["judgement"]) - score_dict["judge_correct"] = judge_result + judge_correct, judge_rating = self._extract_judge_result(prediction["judgement"]) + score_dict["judge_correct"] = judge_correct + score_dict["judge_rating"] = judge_rating if category == "open" and "judge_correct" in score_dict: score_dict["correct"] = score_dict["judge_correct"] @@ -194,6 +213,11 @@ def update(self, predictions): if "char_rate" in pred and pred["char_rate"] is not None: self.char_rate_scores.append(pred["char_rate"]) + # Collect judge ratings (0-5) from judge datasets if available + score_dict = self._get_score_dict(pred) + if "judge_rating" in score_dict: + self.judge_ratings.append(score_dict["judge_rating"]) + self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers) self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers) @@ -219,6 +243,12 @@ def get_metrics(self): elif "judge_correct" in agg_metrics: agg_metrics["success_rate"] = agg_metrics["judge_correct"] + # Add AudioBench-style judge_score if rating outputs were used. 
+ # Formula: judge_score = mean(ratings) * 20 (converts 0-5 scale to 0-100) + if self.judge_ratings: + avg_rating = sum(self.judge_ratings) / len(self.judge_ratings) + agg_metrics["judge_score"] = avg_rating * 20 + # Add existing metrics: WER, PnC, and BLEU if available (convert to percentages and round to 2 decimals) if self.wer_scores: agg_metrics["wer"] = round(100.0 * sum(self.wer_scores) / len(self.wer_scores), 2) @@ -280,6 +310,10 @@ def metrics_to_print(self): if self.compute_no_answer: base_metrics["no_answer"] = as_percentage + # AudioBench-style judge_score (0-100, not a percent) + if self.judge_ratings: + base_metrics["judge_score"] = lambda _k, v, _all: f"{v:.2f}" + # Add existing metrics if they were computed if self.wer_scores: base_metrics["wer"] = as_percentage diff --git a/nemo_skills/pipeline/prepare_data.py b/nemo_skills/pipeline/prepare_data.py index 36820c7337..8c3a58a8ba 100644 --- a/nemo_skills/pipeline/prepare_data.py +++ b/nemo_skills/pipeline/prepare_data.py @@ -31,7 +31,7 @@ # TODO: read this from init.py -DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "asr-leaderboard"] +DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "librispeech-pc", "audiobench", "asr-leaderboard"] @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) diff --git a/nemo_skills/prompt/config/judge/audiobench.yaml b/nemo_skills/prompt/config/judge/audiobench.yaml new file mode 100644 index 0000000000..62faa1acb2 --- /dev/null +++ b/nemo_skills/prompt/config/judge/audiobench.yaml @@ -0,0 +1,28 @@ +# Judge prompt configuration for AudioBench evaluation +# Based on AudioBench's official production system with 0-5 rating scale + + +user: |- + [Reference Answer] + {expected_answer} + + [Model Answer] + {generation} + + [Question] + {question} + + [Task] + Rate the model's answer based on its alignment with the reference answer, focusing on accuracy and relevance to the reference provided. 
Please be critical on the details. If the model response is something like 'cannot decide', please rate as 0. + Criteria: Assess if the model's response mirrors the reference in terms of content, accuracy, and relevance. + Score0: The answer is refusing to give concrete results, providing something like 'cannot decide'. + Score0: The answer is completely misaligned, providing incorrect or irrelevant information compared to the reference. + Score1: The answer shows minimal alignment, often misunderstanding or providing irrelevant details unrelated to the reference. + Score2: The answer recognizes the topic but diverges significantly from the reference in accuracy or relevance. + Score3: The answer aligns with the reference generally but lacks detail or precise accuracy in some aspects. + Score4: The answer is mostly accurate and relevant, closely following the reference but could be clearer or more detailed. + Score5: The answer is highly accurate, detailed, and matches the reference answer perfectly, capturing its essence and detail. + + Your response should be formatted as follows: + Explanation: (Provide a concise explanation of your rating, comparing the reference answer with the model's response. "The reference answer is [XXX], while the model's answer is [YYY]. 
I think ...") + Rating: (int) diff --git a/nemo_skills/prompt/config/judge/audiobench_binary.yaml b/nemo_skills/prompt/config/judge/audiobench_binary.yaml new file mode 100644 index 0000000000..d121607a48 --- /dev/null +++ b/nemo_skills/prompt/config/judge/audiobench_binary.yaml @@ -0,0 +1,29 @@ +# Judge prompt configuration for AudioBench evaluation +# Based on AudioBench's official llama3_70b_as_judge_binary prompt +# (Adapted to nemo-skills Yes/No format (instead of 0/1 Rating)) + +user: |- + [Reference Answer] + {expected_answer} + + [Model Answer] + {generation} + + [Question] + {question} + + [Task] + Rate the model's answer based on its alignment with the reference answer, focusing on accuracy and relevance to the reference provided. Please be critical on the details. + + Criteria: Assess if the model's response mirrors the reference in terms of content, accuracy, and relevance. + + The answer is INCORRECT if: + - The answer is refusing to give concrete results, providing something like 'cannot decide' + - The answer is wrong, providing incorrect or irrelevant information compared to the reference + + The answer is CORRECT if: + - The answer is correct, capturing or covering the meaning from the reference + + Your response should be formatted as follows: + Reasoning: (Provide a concise explanation of your rating, comparing the reference answer with the model's response. "The reference answer is [XXX], while the model's answer is [YYY]. 
I think ...") + Judgement: [Yes or No] diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index aa5df51035..31c8f2cccf 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -45,6 +45,8 @@ "mmau-pro", "asr-leaderboard", "aalcr", # Has tokenization mismatch issues + "audiobench", + "librispeech-pc", } diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 39d4b0398a..86fd152df2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -57,6 +57,8 @@ ("college_math", ["test"]), ("comp-math-24-25", ["test"]), ("mmau-pro", ["test"]), + ("audiobench", ["test"]), + ("librispeech-pc", ["test"]), ]