From 3734ed38db696fda2661e097c59008a8059c0a3a Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Sun, 21 Dec 2025 11:36:53 -0800 Subject: [PATCH 1/8] add musan dataset Signed-off-by: George Zelenfroind --- nemo_skills/dataset/musan/__init__.py | 58 +++ nemo_skills/dataset/musan/prepare.py | 485 ++++++++++++++++++++++++++ 2 files changed, 543 insertions(+) create mode 100644 nemo_skills/dataset/musan/__init__.py create mode 100644 nemo_skills/dataset/musan/prepare.py diff --git a/nemo_skills/dataset/musan/__init__.py b/nemo_skills/dataset/musan/__init__.py new file mode 100644 index 0000000000..e91ef7c3b8 --- /dev/null +++ b/nemo_skills/dataset/musan/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MUSAN: A Music, Speech, and Noise Corpus + +MUSAN is a corpus of music, speech, and noise recordings designed for training +models for voice activity detection and music/speech discrimination. + +DOWNLOAD OPTIONS: + +1. HuggingFace (default - INCOMPLETE): + - 774 samples, 5h 4m total + - Noise: 728 samples (78% complete) + - Fast, no API key needed + +2. Kaggle (RECOMMENDED - COMPLETE): ✓ + - 10.3 GB, 2,016 WAV files + - Noise: 930 files (99.8% complete!) + - Music: 660 files, Speech: 426 files + - Requires Kaggle API key (one-time setup) + +3. OpenSLR (official - COMPLETE): + - 11 GB, full dataset + - No API key needed + +Reference: + David Snyder, Guoguo Chen, and Daniel Povey + "MUSAN: A Music, Speech, and Noise Corpus" + arXiv:1510.08484, 2015 +""" + +DATASET_GROUP = "speechlm" +IS_BENCHMARK_GROUP = True +SCORE_MODULE = "nemo_skills.evaluation.metrics.audio_metrics" +METRICS_TYPE = "audio" + +# Evaluation settings +EVAL_ARGS = "++eval_type=audio " + +# Generation settings - OpenAI format for audio-language models +GENERATION_ARGS = "++prompt_format=openai " + +# Benchmark - single test.jsonl contains all noise samples at top level +BENCHMARKS = { + "musan": {}, +} + diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py new file mode 100644 index 0000000000..1393d1c04b --- /dev/null +++ b/nemo_skills/dataset/musan/prepare.py @@ -0,0 +1,485 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MUSAN Dataset Preparation for nemo-skills + +Prepares the MUSAN dataset (Music, Speech, and Noise Corpus) for use with nemo-skills. + +Dataset sources: + - HuggingFace: 774 samples (~5h), incomplete, fast download + - Kaggle: 2016 files (10.3GB), nearly complete, requires API key + - OpenSLR: Complete dataset (11GB), official source + +Usage: + python -m nemo_skills.dataset.musan.prepare --source kaggle --categories noise + python -m nemo_skills.dataset.musan.prepare --categories noise --max-samples 100 + python -m nemo_skills.dataset.musan.prepare --source openslr --categories noise +""" + +import argparse +import json +import os +import tarfile +import urllib.request +from pathlib import Path +from typing import Dict, List, Optional + +import numpy as np +import soundfile as sf +from tqdm import tqdm + + +# HuggingFace dataset label mappings +CATEGORY_LABELS = { + "noise": 0, + "music": 1, +} + +LABEL_TO_CATEGORY = { + 0: "noise", + 1: "other", +} + + +def download_from_kaggle(output_dir: Path) -> Path: + """Download MUSAN dataset from Kaggle using kagglehub.""" + try: + import kagglehub + except ImportError: + raise ImportError("kagglehub not installed. Run: pip install kagglehub") + + print("Downloading from Kaggle (requires API key in ~/.kaggle/kaggle.json)") + + try: + path = kagglehub.dataset_download("dogrose/musan-dataset") + print(f"Downloaded to: {path}") + return Path(path) + except Exception as e: + raise Exception(f"Kaggle download failed: {e}") + + +def download_from_openslr(output_dir: Path) -> Path: + """Download MUSAN dataset from OpenSLR (11 GB).""" + url = "https://www.openslr.org/resources/17/musan.tar.gz" + download_path = output_dir / "musan.tar.gz" + extract_path = output_dir / "musan_openslr" + + print(f"Downloading from OpenSLR (~11 GB)") + print(f"URL: {url}") + + if not download_path.exists(): + def reporthook(block_num, block_size, total_size): + downloaded = block_num * block_size + percent = min(downloaded / total_size * 100, 100) + mb_downloaded = downloaded / (1024 * 1024) + mb_total = total_size / (1024 * 1024) + print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end='') + + urllib.request.urlretrieve(url, download_path, reporthook) + print("\nDownload complete") + else: + print(f"Using cached archive: {download_path}") + + if not extract_path.exists(): + print(f"Extracting to {extract_path}...") + extract_path.mkdir(parents=True, exist_ok=True) + with tarfile.open(download_path, 'r:gz') as tar: + tar.extractall(extract_path) + print("Extraction complete") + else: + print(f"Using extracted data: {extract_path}") + + return extract_path / "musan" + + +def load_dataset_from_source(source: str, output_dir: Path): + """Load MUSAN dataset from specified source.""" + if source == "huggingface": + from datasets import load_dataset + print("Loading from HuggingFace...") + dataset = load_dataset("FluidInference/musan", split="train") + print(f"Loaded {len(dataset)} samples") + return dataset, "huggingface" + + elif source == "kaggle": + dataset_path = download_from_kaggle(output_dir) + musan_path = dataset_path / "musan" + if not musan_path.exists(): + raise ValueError(f"'musan' directory not found in {dataset_path}") + + print(f"Dataset path: {musan_path}") + for cat in ['music', 'speech', 'noise']: + cat_path = musan_path / cat + if cat_path.exists(): + wav_count = len(list(cat_path.glob("**/*.wav"))) + print(f" {cat}: {wav_count} files") + + return musan_path, "kaggle" + + elif source == "openslr": + dataset_path = download_from_openslr(output_dir) + print(f"Dataset path: {dataset_path}") + for cat in ['music', 'speech', 'noise']: + cat_path = dataset_path / cat + if cat_path.exists(): + wav_count = len(list(cat_path.glob("**/*.wav"))) + print(f" {cat}: {wav_count} files") + + return dataset_path, "openslr" + + else: + raise ValueError(f"Unknown source: {source}") + + +def get_audio_duration(audio_array: np.ndarray, sampling_rate: int) -> float: + """Compute audio duration in seconds.""" + if audio_array is None or len(audio_array) == 0: + return 0.0 + return float(len(audio_array) / sampling_rate) + + +def save_audio_file(audio_array: np.ndarray, sampling_rate: int, output_path: str): + """Save audio array to WAV file.""" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + sf.write(output_path, audio_array, sampling_rate) + + +def create_manifest_entry( + audio_filename: str, + duration: float, + category: str, + sample_id: int, + label: str, +) -> Dict: + """Create nemo-skills manifest entry.""" + audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}" + audio_metadata = {"path": audio_rel_path, "duration": duration} + + # Instruction for transcription (expects empty response for non-speech audio) + instruction = "Transcribe the speech in this audio. If there is no speech, do not output anything." + + entry = { + "audio_path": [audio_rel_path], + "messages": [ + {"role": "system", "content": "You are a helpful assistant. /no_think"}, + { + "role": "user", + "content": instruction, + "audio": audio_metadata, + "audios": [audio_metadata], + }, + ], + "expected_answer": "", + "dataset": "musan", + "subset_for_metrics": f"musan_{category}", + "sample_id": sample_id, + "category": category, + "original_label": label, + "task_type": "Hallucination", + "audio_duration": duration, + "question": instruction, + } + + return entry + + +def process_category_from_files( + category: str, + dataset_path: Path, + output_dir: Path, + save_audio: bool = True, + split: str = "train", + max_samples: int = -1, +) -> tuple[int, List[Dict]]: + """Process MUSAN category from WAV files (Kaggle/OpenSLR format).""" + category_path = dataset_path / category + if not category_path.exists(): + raise ValueError(f"Category directory not found: {category_path}") + + wav_files = sorted(list(category_path.glob("**/*.wav"))) + print(f"Found {len(wav_files)} WAV files") + + if len(wav_files) == 0: + return 0, [] + + if max_samples > 0 and len(wav_files) > max_samples: + wav_files = wav_files[:max_samples] + print(f"Limited to {max_samples} samples") + + audio_dir = output_dir / category / "audio" + dataset_dir = output_dir / category + os.makedirs(audio_dir, exist_ok=True) + os.makedirs(dataset_dir, exist_ok=True) + + manifest_entries = [] + successful = 0 + failed = 0 + + for idx, wav_path in enumerate(tqdm(wav_files, desc=f"Processing {category}")): + try: + audio_array, sampling_rate = sf.read(str(wav_path)) + duration = get_audio_duration(audio_array, sampling_rate) + audio_filename = f"musan_{category}_{idx:06d}.wav" + local_audio_path = audio_dir / audio_filename + + if save_audio: + try: + save_audio_file(audio_array, sampling_rate, str(local_audio_path)) + except Exception as e: + print(f"Failed to save sample {idx}: {e}") + failed += 1 + continue + + entry = create_manifest_entry( + audio_filename=audio_filename, + duration=duration, + category=category, + sample_id=idx, + label=wav_path.stem, + ) + + manifest_entries.append(entry) + successful += 1 + + except Exception as e: + print(f"Error processing {wav_path}: {e}") + failed += 1 + continue + + manifest_path = dataset_dir / "test.jsonl" + with open(manifest_path, "w", encoding="utf-8") as f: + for entry in manifest_entries: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + print(f"Saved {successful} samples to {manifest_path}") + if failed > 0: + print(f"Failed: {failed} samples") + + return successful, manifest_entries + + +def process_category( + category: str, + output_dir: Path, + dataset, + source_type: str, + save_audio: bool = True, + split: str = "train", + max_samples: int = -1, +) -> tuple[int, List[Dict]]: + """Process a single MUSAN category.""" + print(f"\n{'=' * 60}") + print(f"Processing: {category}") + print(f"{'=' * 60}") + + if source_type in ["kaggle", "openslr"]: + return process_category_from_files( + category=category, + dataset_path=dataset, + output_dir=output_dir, + save_audio=save_audio, + split=split, + max_samples=max_samples, + ) + + elif source_type != "huggingface": + raise NotImplementedError(f"Source '{source_type}' not supported") + + filtered_samples = [] + target_label = CATEGORY_LABELS.get(category) + if target_label is None: + print(f"Unknown category '{category}'") + return 0, [] + + for sample in dataset: + label = sample.get("label") + if label == target_label: + filtered_samples.append(sample) + + print(f"Found {len(filtered_samples)} samples") + + if len(filtered_samples) == 0: + return 0, [] + + if max_samples > 0 and len(filtered_samples) > max_samples: + filtered_samples = filtered_samples[:max_samples] + print(f"Limited to {max_samples} samples") + + # Create output directories + audio_dir = output_dir / category / "audio" + dataset_dir = output_dir / category + os.makedirs(audio_dir, exist_ok=True) + os.makedirs(dataset_dir, exist_ok=True) + + manifest_entries = [] + successful = 0 + failed = 0 + + for idx, sample in enumerate(tqdm(filtered_samples, desc=f"Processing {category}")): + try: + audio_dict = sample.get("audio") + if audio_dict is None: + failed += 1 + continue + + if isinstance(audio_dict, dict): + audio_array = audio_dict.get("array") + sampling_rate = audio_dict.get("sampling_rate", 16000) + else: + failed += 1 + continue + + if audio_array is None or len(audio_array) == 0: + failed += 1 + continue + + if isinstance(audio_array, list): + audio_array = np.array(audio_array) + + duration = get_audio_duration(audio_array, sampling_rate) + audio_filename = f"musan_{category}_{idx:06d}.wav" + local_audio_path = audio_dir / audio_filename + + if save_audio: + try: + save_audio_file(audio_array, sampling_rate, str(local_audio_path)) + except Exception as e: + print(f"Failed to save sample {idx}: {e}") + failed += 1 + continue + + label = sample.get("label", -1) + entry = create_manifest_entry( + audio_filename=audio_filename, + duration=duration, + category=category, + sample_id=idx, + label=str(label), + ) + + manifest_entries.append(entry) + successful += 1 + + except Exception as e: + print(f"Error processing sample {idx}: {e}") + failed += 1 + continue + + manifest_path = dataset_dir / "test.jsonl" + with open(manifest_path, "w", encoding="utf-8") as f: + for entry in manifest_entries: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + print(f"Saved {successful} samples to {manifest_path}") + if failed > 0: + print(f"Failed: {failed} samples") + + return successful, manifest_entries + + +def main(): + parser = argparse.ArgumentParser(description="Prepare MUSAN dataset for nemo-skills") + parser.add_argument( + "--source", + choices=["huggingface", "kaggle", "openslr"], + default="huggingface", + help="Download source: huggingface (fast, incomplete), kaggle (complete, API key), openslr (complete, 11GB)", + ) + parser.add_argument("--split", default="train", choices=["train", "validation", "test"]) + parser.add_argument("--output-dir", type=str, default=None) + parser.add_argument( + "--categories", + nargs="+", + choices=["music", "speech", "noise"], + default=["music", "speech", "noise"], + ) + parser.add_argument("--no-audio", dest="save_audio", action="store_false") + parser.add_argument("--max-samples", type=int, default=-1) + parser.set_defaults(save_audio=True) + + args = parser.parse_args() + + if args.output_dir: + output_dir = Path(args.output_dir) + else: + output_dir = Path(__file__).parent + + output_dir.mkdir(parents=True, exist_ok=True) + + print("\n" + "=" * 60) + print(f"MUSAN Dataset Preparation") + print("=" * 60) + print(f"Source: {args.source}") + print(f"Output: {output_dir}") + print(f"Categories: {', '.join(args.categories)}") + print("=" * 60 + "\n") + + try: + dataset, source_type = load_dataset_from_source(args.source, output_dir) + except Exception as e: + print(f"Failed to load dataset: {e}") + return + + total_samples = 0 + successful_categories = [] + failed_categories = [] + all_entries = [] + + for category in args.categories: + try: + num_samples, entries = process_category( + category=category, + output_dir=output_dir, + dataset=dataset, + source_type=source_type, + save_audio=args.save_audio, + split=args.split, + max_samples=args.max_samples, + ) + total_samples += num_samples + successful_categories.append(category) + all_entries.extend(entries) + + except Exception as e: + print(f"\nFailed: {category} - {e}\n") + failed_categories.append((category, str(e))) + + if all_entries: + combined_manifest_path = output_dir / "test.jsonl" + with open(combined_manifest_path, "w", encoding="utf-8") as f: + for entry in all_entries: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + print(f"\nCombined manifest: {combined_manifest_path}") + print(f"Total samples: {len(all_entries)}") + + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}") + print(f"Total samples: {total_samples}") + + if successful_categories: + for name in successful_categories: + print(f" ✓ {name}") + + if failed_categories: + for name, error in failed_categories: + print(f" ✗ {name}: {error}") + + print("=" * 60 + "\n") + + +if __name__ == "__main__": + main() + From 7b35079b0a6a875c78bda1a80264944d78563e25 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Mon, 22 Dec 2025 03:28:53 -0800 Subject: [PATCH 2/8] Pre commit and converting from chars/sec to chars/min Signed-off-by: George Zelenfroind --- nemo_skills/dataset/musan/__init__.py | 3 +- nemo_skills/dataset/musan/prepare.py | 86 ++++++++++--------- nemo_skills/evaluation/evaluator/audio.py | 14 +-- .../evaluation/metrics/audio_metrics.py | 4 +- 4 files changed, 56 insertions(+), 51 deletions(-) diff --git a/nemo_skills/dataset/musan/__init__.py b/nemo_skills/dataset/musan/__init__.py index e91ef7c3b8..2962ad75bf 100644 --- a/nemo_skills/dataset/musan/__init__.py +++ b/nemo_skills/dataset/musan/__init__.py @@ -14,7 +14,7 @@ """MUSAN: A Music, Speech, and Noise Corpus -MUSAN is a corpus of music, speech, and noise recordings designed for training +MUSAN is a corpus of music, speech, and noise recordings designed for training models for voice activity detection and music/speech discrimination. DOWNLOAD OPTIONS: @@ -55,4 +55,3 @@ BENCHMARKS = { "musan": {}, } - diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py index 1393d1c04b..c5367867c0 100644 --- a/nemo_skills/dataset/musan/prepare.py +++ b/nemo_skills/dataset/musan/prepare.py @@ -33,13 +33,12 @@ import tarfile import urllib.request from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List import numpy as np import soundfile as sf from tqdm import tqdm - # HuggingFace dataset label mappings CATEGORY_LABELS = { "noise": 0, @@ -58,9 +57,9 @@ def download_from_kaggle(output_dir: Path) -> Path: import kagglehub except ImportError: raise ImportError("kagglehub not installed. Run: pip install kagglehub") - + print("Downloading from Kaggle (requires API key in ~/.kaggle/kaggle.json)") - + try: path = kagglehub.dataset_download("dogrose/musan-dataset") print(f"Downloaded to: {path}") @@ -74,32 +73,33 @@ def download_from_openslr(output_dir: Path) -> Path: url = "https://www.openslr.org/resources/17/musan.tar.gz" download_path = output_dir / "musan.tar.gz" extract_path = output_dir / "musan_openslr" - - print(f"Downloading from OpenSLR (~11 GB)") + + print("Downloading from OpenSLR (~11 GB)") print(f"URL: {url}") - + if not download_path.exists(): + def reporthook(block_num, block_size, total_size): downloaded = block_num * block_size percent = min(downloaded / total_size * 100, 100) mb_downloaded = downloaded / (1024 * 1024) mb_total = total_size / (1024 * 1024) - print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end='') - + print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="") + urllib.request.urlretrieve(url, download_path, reporthook) print("\nDownload complete") else: print(f"Using cached archive: {download_path}") - + if not extract_path.exists(): print(f"Extracting to {extract_path}...") extract_path.mkdir(parents=True, exist_ok=True) - with tarfile.open(download_path, 'r:gz') as tar: + with tarfile.open(download_path, "r:gz") as tar: tar.extractall(extract_path) print("Extraction complete") else: print(f"Using extracted data: {extract_path}") - + return extract_path / "musan" @@ -107,37 +107,38 @@ def load_dataset_from_source(source: str, output_dir: Path): """Load MUSAN dataset from specified source.""" if source == "huggingface": from datasets import load_dataset + print("Loading from HuggingFace...") dataset = load_dataset("FluidInference/musan", split="train") print(f"Loaded {len(dataset)} samples") return dataset, "huggingface" - + elif source == "kaggle": dataset_path = download_from_kaggle(output_dir) musan_path = dataset_path / "musan" if not musan_path.exists(): raise ValueError(f"'musan' directory not found in {dataset_path}") - + print(f"Dataset path: {musan_path}") - for cat in ['music', 'speech', 'noise']: + for cat in ["music", "speech", "noise"]: cat_path = musan_path / cat if cat_path.exists(): wav_count = len(list(cat_path.glob("**/*.wav"))) print(f" {cat}: {wav_count} files") - + return musan_path, "kaggle" - + elif source == "openslr": dataset_path = download_from_openslr(output_dir) print(f"Dataset path: {dataset_path}") - for cat in ['music', 'speech', 'noise']: + for cat in ["music", "speech", "noise"]: cat_path = dataset_path / cat if cat_path.exists(): wav_count = len(list(cat_path.glob("**/*.wav"))) print(f" {cat}: {wav_count} files") - + return dataset_path, "openslr" - + else: raise ValueError(f"Unknown source: {source}") @@ -165,7 +166,7 @@ def create_manifest_entry( """Create nemo-skills manifest entry.""" audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}" audio_metadata = {"path": audio_rel_path, "duration": duration} - + # Instruction for transcription (expects empty response for non-speech audio) instruction = "Transcribe the speech in this audio. If there is no speech, do not output anything." @@ -206,33 +207,33 @@ def process_category_from_files( category_path = dataset_path / category if not category_path.exists(): raise ValueError(f"Category directory not found: {category_path}") - + wav_files = sorted(list(category_path.glob("**/*.wav"))) print(f"Found {len(wav_files)} WAV files") - + if len(wav_files) == 0: return 0, [] - + if max_samples > 0 and len(wav_files) > max_samples: wav_files = wav_files[:max_samples] print(f"Limited to {max_samples} samples") - + audio_dir = output_dir / category / "audio" dataset_dir = output_dir / category os.makedirs(audio_dir, exist_ok=True) os.makedirs(dataset_dir, exist_ok=True) - + manifest_entries = [] successful = 0 failed = 0 - + for idx, wav_path in enumerate(tqdm(wav_files, desc=f"Processing {category}")): try: audio_array, sampling_rate = sf.read(str(wav_path)) duration = get_audio_duration(audio_array, sampling_rate) audio_filename = f"musan_{category}_{idx:06d}.wav" local_audio_path = audio_dir / audio_filename - + if save_audio: try: save_audio_file(audio_array, sampling_rate, str(local_audio_path)) @@ -240,7 +241,7 @@ def process_category_from_files( print(f"Failed to save sample {idx}: {e}") failed += 1 continue - + entry = create_manifest_entry( audio_filename=audio_filename, duration=duration, @@ -248,24 +249,24 @@ def process_category_from_files( sample_id=idx, label=wav_path.stem, ) - + manifest_entries.append(entry) successful += 1 - + except Exception as e: print(f"Error processing {wav_path}: {e}") failed += 1 continue - + manifest_path = dataset_dir / "test.jsonl" with open(manifest_path, "w", encoding="utf-8") as f: for entry in manifest_entries: f.write(json.dumps(entry, ensure_ascii=False) + "\n") - + print(f"Saved {successful} samples to {manifest_path}") if failed > 0: print(f"Failed: {failed} samples") - + return successful, manifest_entries @@ -292,7 +293,7 @@ def process_category( split=split, max_samples=max_samples, ) - + elif source_type != "huggingface": raise NotImplementedError(f"Source '{source_type}' not supported") @@ -301,14 +302,14 @@ def process_category( if target_label is None: print(f"Unknown category '{category}'") return 0, [] - + for sample in dataset: label = sample.get("label") if label == target_label: filtered_samples.append(sample) - + print(f"Found {len(filtered_samples)} samples") - + if len(filtered_samples) == 0: return 0, [] @@ -418,13 +419,13 @@ def main(): output_dir.mkdir(parents=True, exist_ok=True) print("\n" + "=" * 60) - print(f"MUSAN Dataset Preparation") + print("MUSAN Dataset Preparation") print("=" * 60) print(f"Source: {args.source}") print(f"Output: {output_dir}") print(f"Categories: {', '.join(args.categories)}") print("=" * 60 + "\n") - + try: dataset, source_type = load_dataset_from_source(args.source, output_dir) except Exception as e: @@ -466,7 +467,9 @@ def main(): print("\n" + "=" * 60) print("SUMMARY") print("=" * 60) - print(f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}") + print( + f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}" + ) print(f"Total samples: {total_samples}") if successful_categories: @@ -482,4 +485,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/nemo_skills/evaluation/evaluator/audio.py b/nemo_skills/evaluation/evaluator/audio.py index c212666311..ff97181bbb 100644 --- a/nemo_skills/evaluation/evaluator/audio.py +++ b/nemo_skills/evaluation/evaluator/audio.py @@ -224,7 +224,7 @@ def evaluate_cer(reference: str, hypothesis: str) -> dict[str, Any]: def evaluate_hallucination(reference: str, hypothesis: str, audio_context: dict = None) -> dict[str, Any]: """Detect potential hallucinations via speaking rate anomaly. - Normal speech: ~10-15 chars/second. Higher rates suggest repetition/hallucination. + Normal speech: ~600-900 chars/minute. Higher rates suggest repetition/hallucination. Requires audio_duration in audio_context. """ audio_duration = audio_context.get("audio_duration") if audio_context else None @@ -238,10 +238,11 @@ def evaluate_hallucination(reference: str, hypothesis: str, audio_context: dict } char_count = len(hypothesis) - char_rate = char_count / audio_duration + # Convert to chars/minute + char_rate = (char_count / audio_duration) * 60.0 - # Hallucination threshold: >25 chars/sec (too fast = likely repetition) - is_hallucinating = char_rate > 25.0 + # Hallucination threshold: >1500 chars/min (25 chars/second * 60) + is_hallucinating = char_rate > 1500.0 return { "hallucination_rate": 1.0 if is_hallucinating else 0.0, @@ -385,8 +386,9 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic audio_duration = sample.get("audio_duration", None) if audio_duration and audio_duration > 0 and expected_answer and generation: - updates["ref_char_rate"] = len(expected_answer) / audio_duration - updates["hyp_char_rate"] = len(generation) / audio_duration + # chars/minute (chars/second * 60) + updates["ref_char_rate"] = (len(expected_answer) / audio_duration) * 60.0 + updates["hyp_char_rate"] = (len(generation) / audio_duration) * 60.0 updates["char_rate_diff"] = abs(updates["hyp_char_rate"] - updates["ref_char_rate"]) return updates diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py index 95a133833d..811d93a88a 100644 --- a/nemo_skills/evaluation/metrics/audio_metrics.py +++ b/nemo_skills/evaluation/metrics/audio_metrics.py @@ -72,6 +72,7 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): self.pc_rate_scores = [] self.punct_f1_scores = [] self.cap_accuracy_scores = [] + # Stored as chars/minute (see evaluator/audio.py). self.char_rate_scores = [] # Judge scores (AudioBench-style rating 0-5, or legacy binary Yes/No mapped to 1/0) @@ -337,8 +338,9 @@ def metrics_to_print(self): base_metrics["punct_f1"] = as_percentage if self.cap_accuracy_scores: base_metrics["cap_accuracy"] = as_percentage + # char_rate is chars/minute (not a percent). if self.char_rate_scores: - base_metrics["char_rate"] = as_int + base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}" base_metrics["num_entries"] = as_int # Add at end for better display order From f8d71a67ff1b2d89ca86cf8e31c2feed7b3cb581 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Mon, 22 Dec 2025 06:42:30 -0800 Subject: [PATCH 3/8] pre commit, upd to mins Signed-off-by: George Zelenfroind --- .../evaluation/metrics/audio_metrics.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py index 811d93a88a..95f015542a 100644 --- a/nemo_skills/evaluation/metrics/audio_metrics.py +++ b/nemo_skills/evaluation/metrics/audio_metrics.py @@ -72,8 +72,8 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): self.pc_rate_scores = [] self.punct_f1_scores = [] self.cap_accuracy_scores = [] - # Stored as chars/minute (see evaluator/audio.py). - self.char_rate_scores = [] + self.total_hallucinated_chars = 0 + self.total_audio_seconds = 0.0 # Judge scores (AudioBench-style rating 0-5, or legacy binary Yes/No mapped to 1/0) self.judge_ratings = [] @@ -211,8 +211,13 @@ def update(self, predictions): self.punct_f1_scores.append(pred["punct_f1"]) if "cap_accuracy" in pred and pred["cap_accuracy"] is not None: self.cap_accuracy_scores.append(pred["cap_accuracy"]) - if "char_rate" in pred and pred["char_rate"] is not None: - self.char_rate_scores.append(pred["char_rate"]) + + if pred.get("task_type") == "Hallucination": + predicted_text = pred.get("predicted_answer") or pred.get("generation") or "" + audio_duration = pred.get("audio_duration", 0.0) + if audio_duration > 0: + self.total_hallucinated_chars += len(predicted_text.strip()) + self.total_audio_seconds += audio_duration # Collect judge ratings (0-5) from judge datasets if available score_dict = self._get_score_dict(pred) @@ -277,8 +282,9 @@ def get_metrics(self): agg_metrics["cap_accuracy"] = round( 100.0 * sum(self.cap_accuracy_scores) / len(self.cap_accuracy_scores), 2 ) - if self.char_rate_scores: - agg_metrics["char_rate"] = round(sum(self.char_rate_scores) / len(self.char_rate_scores), 2) + if self.total_audio_seconds > 0: + total_minutes = self.total_audio_seconds / 60.0 + agg_metrics["char_rate"] = round(self.total_hallucinated_chars / total_minutes, 2) return metrics_dict @@ -338,8 +344,7 @@ def metrics_to_print(self): base_metrics["punct_f1"] = as_percentage if self.cap_accuracy_scores: base_metrics["cap_accuracy"] = as_percentage - # char_rate is chars/minute (not a percent). - if self.char_rate_scores: + if self.total_audio_seconds > 0: base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}" base_metrics["num_entries"] = as_int # Add at end for better display order From 15245ac71557885b2f93a5324eee734d97dd9fa1 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Mon, 22 Dec 2025 06:54:18 -0800 Subject: [PATCH 4/8] using standart float Signed-off-by: George Zelenfroind --- nemo_skills/evaluation/metrics/audio_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py index 95f015542a..7142f634fe 100644 --- a/nemo_skills/evaluation/metrics/audio_metrics.py +++ b/nemo_skills/evaluation/metrics/audio_metrics.py @@ -34,7 +34,7 @@ import logging -from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage +from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float, as_int, as_percentage from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) @@ -345,7 +345,7 @@ def metrics_to_print(self): if self.cap_accuracy_scores: base_metrics["cap_accuracy"] = as_percentage if self.total_audio_seconds > 0: - base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}" + base_metrics["char_rate"] = as_float base_metrics["num_entries"] = as_int # Add at end for better display order From ed9d5e3d4b8ca34d43ad8da88be13d8ed52ea556 Mon Sep 17 00:00:00 2001 From: George <37293288+Jorjeous@users.noreply.github.com> Date: Thu, 8 Jan 2026 20:17:36 +0400 Subject: [PATCH 5/8] Update nemo_skills/dataset/musan/prepare.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: George <37293288+Jorjeous@users.noreply.github.com> --- nemo_skills/dataset/musan/prepare.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py index c5367867c0..02c6814d57 100644 --- a/nemo_skills/dataset/musan/prepare.py +++ b/nemo_skills/dataset/musan/prepare.py @@ -94,7 +94,9 @@ def reporthook(block_num, block_size, total_size): if not extract_path.exists(): print(f"Extracting to {extract_path}...") extract_path.mkdir(parents=True, exist_ok=True) - with tarfile.open(download_path, "r:gz") as tar: + if sys.version_info >= (3, 11, 4): + tar.extractall(extract_path, filter="data") + else: tar.extractall(extract_path) print("Extraction complete") else: From 73b80746d138926f2a5567c46d63f6efec36a2c6 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 8 Jan 2026 09:42:13 -0800 Subject: [PATCH 6/8] add to test and exclude Signed-off-by: George Zelenfroind --- nemo_skills/pipeline/prepare_data.py | 2 +- tests/gpu-tests/test_eval.py | 1 + tests/test_datasets.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/prepare_data.py b/nemo_skills/pipeline/prepare_data.py index 8c3a58a8ba..f4f8328d13 100644 --- a/nemo_skills/pipeline/prepare_data.py +++ b/nemo_skills/pipeline/prepare_data.py @@ -31,7 +31,7 @@ # TODO: read this from init.py -DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "librispeech-pc", "audiobench", "asr-leaderboard"] +DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "librispeech-pc", "audiobench", "asr-leaderboard", "musan"] @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 47060a1368..f56a244289 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -46,6 +46,7 @@ "aalcr", # Has tokenization mismatch issues "audiobench", "librispeech-pc", + "musan", } diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 86fd152df2..f0c77675dc 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -59,6 +59,7 @@ ("mmau-pro", ["test"]), ("audiobench", ["test"]), ("librispeech-pc", ["test"]), + ("musan", ["test"]), ] From e51990f47192e9f0f5361dbd97b3c7167a90527c Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 8 Jan 2026 09:51:05 -0800 Subject: [PATCH 7/8] pre commit and revert git proposal Signed-off-by: George Zelenfroind --- nemo_skills/dataset/musan/prepare.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py index 02c6814d57..84026ee4c1 100644 --- a/nemo_skills/dataset/musan/prepare.py +++ b/nemo_skills/dataset/musan/prepare.py @@ -30,6 +30,7 @@ import argparse import json import os +import sys import tarfile import urllib.request from pathlib import Path @@ -94,10 +95,11 @@ def reporthook(block_num, block_size, total_size): if not extract_path.exists(): print(f"Extracting to {extract_path}...") extract_path.mkdir(parents=True, exist_ok=True) - if sys.version_info >= (3, 11, 4): - tar.extractall(extract_path, filter="data") - else: - tar.extractall(extract_path) + with tarfile.open(download_path, "r:gz") as tar: + if sys.version_info >= (3, 11, 4): + tar.extractall(extract_path, filter="data") + else: + tar.extractall(extract_path) print("Extraction complete") else: print(f"Using extracted data: {extract_path}") From c1b6cd8a511ec211537fd01963804512ec0ef9e3 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Mon, 12 Jan 2026 04:09:27 -0800 Subject: [PATCH 8/8] update hardcoded to NEMO_SKILLS_AUDIO_ROOT Signed-off-by: George Zelenfroind --- nemo_skills/dataset/musan/prepare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py index 84026ee4c1..8a735896ba 100644 --- a/nemo_skills/dataset/musan/prepare.py +++ b/nemo_skills/dataset/musan/prepare.py @@ -168,7 +168,8 @@ def create_manifest_entry( label: str, ) -> Dict: """Create nemo-skills manifest entry.""" - audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}" + audio_root = os.getenv("NEMO_SKILLS_AUDIO_ROOT", "/data") + audio_rel_path = f"{audio_root}/musan/{category}/audio/{audio_filename}" audio_metadata = {"path": audio_rel_path, "duration": duration} # Instruction for transcription (expects empty response for non-speech audio)