From 3734ed38db696fda2661e097c59008a8059c0a3a Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Sun, 21 Dec 2025 11:36:53 -0800
Subject: [PATCH 1/8] add musan dataset

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/dataset/musan/__init__.py |  58 +++
 nemo_skills/dataset/musan/prepare.py  | 485 ++++++++++++++++++++++++++
 2 files changed, 543 insertions(+)
 create mode 100644 nemo_skills/dataset/musan/__init__.py
 create mode 100644 nemo_skills/dataset/musan/prepare.py

diff --git a/nemo_skills/dataset/musan/__init__.py b/nemo_skills/dataset/musan/__init__.py
new file mode 100644
index 0000000000..e91ef7c3b8
--- /dev/null
+++ b/nemo_skills/dataset/musan/__init__.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""MUSAN: A Music, Speech, and Noise Corpus
+
+MUSAN is a corpus of music, speech, and noise recordings designed for training 
+models for voice activity detection and music/speech discrimination.
+
+DOWNLOAD OPTIONS:
+
+1. HuggingFace (default - INCOMPLETE):
+   - 774 samples, 5h 4m total
+   - Noise: 728 samples (78% complete)
+   - Fast, no API key needed
+
+2. Kaggle (RECOMMENDED - COMPLETE): ✓
+   - 10.3 GB, 2,016 WAV files
+   - Noise: 930 files (99.8% complete!)
+   - Music: 660 files, Speech: 426 files
+   - Requires Kaggle API key (one-time setup)
+
+3. OpenSLR (official - COMPLETE):
+   - 11 GB, full dataset
+   - No API key needed
+
+Reference:
+    David Snyder, Guoguo Chen, and Daniel Povey
+    "MUSAN: A Music, Speech, and Noise Corpus"
+    arXiv:1510.08484, 2015
+"""
+
+DATASET_GROUP = "speechlm"
+IS_BENCHMARK_GROUP = True
+SCORE_MODULE = "nemo_skills.evaluation.metrics.audio_metrics"
+METRICS_TYPE = "audio"
+
+# Evaluation settings
+EVAL_ARGS = "++eval_type=audio "
+
+# Generation settings - OpenAI format for audio-language models
+GENERATION_ARGS = "++prompt_format=openai "
+
+# Benchmark - single test.jsonl contains all noise samples at top level
+BENCHMARKS = {
+    "musan": {},
+}
+
diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
new file mode 100644
index 0000000000..1393d1c04b
--- /dev/null
+++ b/nemo_skills/dataset/musan/prepare.py
@@ -0,0 +1,485 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""MUSAN Dataset Preparation for nemo-skills
+
+Prepares the MUSAN dataset (Music, Speech, and Noise Corpus) for use with nemo-skills.
+
+Dataset sources:
+  - HuggingFace: 774 samples (~5h), incomplete, fast download
+  - Kaggle: 2016 files (10.3GB), nearly complete, requires API key
+  - OpenSLR: Complete dataset (11GB), official source
+
+Usage:
+    python -m nemo_skills.dataset.musan.prepare --source kaggle --categories noise
+    python -m nemo_skills.dataset.musan.prepare --categories noise --max-samples 100
+    python -m nemo_skills.dataset.musan.prepare --source openslr --categories noise
+"""
+
+import argparse
+import json
+import os
+import tarfile
+import urllib.request
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+
+
+# HuggingFace dataset label mappings
+CATEGORY_LABELS = {
+    "noise": 0,
+    "music": 1,
+}
+
+LABEL_TO_CATEGORY = {
+    0: "noise",
+    1: "other",
+}
+
+
+def download_from_kaggle(output_dir: Path) -> Path:
+    """Download MUSAN dataset from Kaggle using kagglehub."""
+    try:
+        import kagglehub
+    except ImportError:
+        raise ImportError("kagglehub not installed. Run: pip install kagglehub")
+    
+    print("Downloading from Kaggle (requires API key in ~/.kaggle/kaggle.json)")
+    
+    try:
+        path = kagglehub.dataset_download("dogrose/musan-dataset")
+        print(f"Downloaded to: {path}")
+        return Path(path)
+    except Exception as e:
+        raise Exception(f"Kaggle download failed: {e}")
+
+
+def download_from_openslr(output_dir: Path) -> Path:
+    """Download MUSAN dataset from OpenSLR (11 GB)."""
+    url = "https://www.openslr.org/resources/17/musan.tar.gz"
+    download_path = output_dir / "musan.tar.gz"
+    extract_path = output_dir / "musan_openslr"
+    
+    print(f"Downloading from OpenSLR (~11 GB)")
+    print(f"URL: {url}")
+    
+    if not download_path.exists():
+        def reporthook(block_num, block_size, total_size):
+            downloaded = block_num * block_size
+            percent = min(downloaded / total_size * 100, 100)
+            mb_downloaded = downloaded / (1024 * 1024)
+            mb_total = total_size / (1024 * 1024)
+            print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end='')
+        
+        urllib.request.urlretrieve(url, download_path, reporthook)
+        print("\nDownload complete")
+    else:
+        print(f"Using cached archive: {download_path}")
+    
+    if not extract_path.exists():
+        print(f"Extracting to {extract_path}...")
+        extract_path.mkdir(parents=True, exist_ok=True)
+        with tarfile.open(download_path, 'r:gz') as tar:
+            tar.extractall(extract_path)
+        print("Extraction complete")
+    else:
+        print(f"Using extracted data: {extract_path}")
+    
+    return extract_path / "musan"
+
+
+def load_dataset_from_source(source: str, output_dir: Path):
+    """Load MUSAN dataset from specified source."""
+    if source == "huggingface":
+        from datasets import load_dataset
+        print("Loading from HuggingFace...")
+        dataset = load_dataset("FluidInference/musan", split="train")
+        print(f"Loaded {len(dataset)} samples")
+        return dataset, "huggingface"
+        
+    elif source == "kaggle":
+        dataset_path = download_from_kaggle(output_dir)
+        musan_path = dataset_path / "musan"
+        if not musan_path.exists():
+            raise ValueError(f"'musan' directory not found in {dataset_path}")
+        
+        print(f"Dataset path: {musan_path}")
+        for cat in ['music', 'speech', 'noise']:
+            cat_path = musan_path / cat
+            if cat_path.exists():
+                wav_count = len(list(cat_path.glob("**/*.wav")))
+                print(f"  {cat}: {wav_count} files")
+        
+        return musan_path, "kaggle"
+        
+    elif source == "openslr":
+        dataset_path = download_from_openslr(output_dir)
+        print(f"Dataset path: {dataset_path}")
+        for cat in ['music', 'speech', 'noise']:
+            cat_path = dataset_path / cat
+            if cat_path.exists():
+                wav_count = len(list(cat_path.glob("**/*.wav")))
+                print(f"  {cat}: {wav_count} files")
+        
+        return dataset_path, "openslr"
+        
+    else:
+        raise ValueError(f"Unknown source: {source}")
+
+
+def get_audio_duration(audio_array: np.ndarray, sampling_rate: int) -> float:
+    """Compute audio duration in seconds."""
+    if audio_array is None or len(audio_array) == 0:
+        return 0.0
+    return float(len(audio_array) / sampling_rate)
+
+
+def save_audio_file(audio_array: np.ndarray, sampling_rate: int, output_path: str):
+    """Save audio array to WAV file."""
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    sf.write(output_path, audio_array, sampling_rate)
+
+
+def create_manifest_entry(
+    audio_filename: str,
+    duration: float,
+    category: str,
+    sample_id: int,
+    label: str,
+) -> Dict:
+    """Create nemo-skills manifest entry."""
+    audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}"
+    audio_metadata = {"path": audio_rel_path, "duration": duration}
+    
+    # Instruction for transcription (expects empty response for non-speech audio)
+    instruction = "Transcribe the speech in this audio. If there is no speech, do not output anything."
+
+    entry = {
+        "audio_path": [audio_rel_path],
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant. /no_think"},
+            {
+                "role": "user",
+                "content": instruction,
+                "audio": audio_metadata,
+                "audios": [audio_metadata],
+            },
+        ],
+        "expected_answer": "",
+        "dataset": "musan",
+        "subset_for_metrics": f"musan_{category}",
+        "sample_id": sample_id,
+        "category": category,
+        "original_label": label,
+        "task_type": "Hallucination",
+        "audio_duration": duration,
+        "question": instruction,
+    }
+
+    return entry
+
+
+def process_category_from_files(
+    category: str,
+    dataset_path: Path,
+    output_dir: Path,
+    save_audio: bool = True,
+    split: str = "train",
+    max_samples: int = -1,
+) -> tuple[int, List[Dict]]:
+    """Process MUSAN category from WAV files (Kaggle/OpenSLR format)."""
+    category_path = dataset_path / category
+    if not category_path.exists():
+        raise ValueError(f"Category directory not found: {category_path}")
+    
+    wav_files = sorted(list(category_path.glob("**/*.wav")))
+    print(f"Found {len(wav_files)} WAV files")
+    
+    if len(wav_files) == 0:
+        return 0, []
+    
+    if max_samples > 0 and len(wav_files) > max_samples:
+        wav_files = wav_files[:max_samples]
+        print(f"Limited to {max_samples} samples")
+    
+    audio_dir = output_dir / category / "audio"
+    dataset_dir = output_dir / category
+    os.makedirs(audio_dir, exist_ok=True)
+    os.makedirs(dataset_dir, exist_ok=True)
+    
+    manifest_entries = []
+    successful = 0
+    failed = 0
+    
+    for idx, wav_path in enumerate(tqdm(wav_files, desc=f"Processing {category}")):
+        try:
+            audio_array, sampling_rate = sf.read(str(wav_path))
+            duration = get_audio_duration(audio_array, sampling_rate)
+            audio_filename = f"musan_{category}_{idx:06d}.wav"
+            local_audio_path = audio_dir / audio_filename
+            
+            if save_audio:
+                try:
+                    save_audio_file(audio_array, sampling_rate, str(local_audio_path))
+                except Exception as e:
+                    print(f"Failed to save sample {idx}: {e}")
+                    failed += 1
+                    continue
+            
+            entry = create_manifest_entry(
+                audio_filename=audio_filename,
+                duration=duration,
+                category=category,
+                sample_id=idx,
+                label=wav_path.stem,
+            )
+            
+            manifest_entries.append(entry)
+            successful += 1
+            
+        except Exception as e:
+            print(f"Error processing {wav_path}: {e}")
+            failed += 1
+            continue
+    
+    manifest_path = dataset_dir / "test.jsonl"
+    with open(manifest_path, "w", encoding="utf-8") as f:
+        for entry in manifest_entries:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+    
+    print(f"Saved {successful} samples to {manifest_path}")
+    if failed > 0:
+        print(f"Failed: {failed} samples")
+    
+    return successful, manifest_entries
+
+
+def process_category(
+    category: str,
+    output_dir: Path,
+    dataset,
+    source_type: str,
+    save_audio: bool = True,
+    split: str = "train",
+    max_samples: int = -1,
+) -> tuple[int, List[Dict]]:
+    """Process a single MUSAN category."""
+    print(f"\n{'=' * 60}")
+    print(f"Processing: {category}")
+    print(f"{'=' * 60}")
+
+    if source_type in ["kaggle", "openslr"]:
+        return process_category_from_files(
+            category=category,
+            dataset_path=dataset,
+            output_dir=output_dir,
+            save_audio=save_audio,
+            split=split,
+            max_samples=max_samples,
+        )
+    
+    elif source_type != "huggingface":
+        raise NotImplementedError(f"Source '{source_type}' not supported")
+
+    filtered_samples = []
+    target_label = CATEGORY_LABELS.get(category)
+    if target_label is None:
+        print(f"Unknown category '{category}'")
+        return 0, []
+    
+    for sample in dataset:
+        label = sample.get("label")
+        if label == target_label:
+            filtered_samples.append(sample)
+    
+    print(f"Found {len(filtered_samples)} samples")
+    
+    if len(filtered_samples) == 0:
+        return 0, []
+
+    if max_samples > 0 and len(filtered_samples) > max_samples:
+        filtered_samples = filtered_samples[:max_samples]
+        print(f"Limited to {max_samples} samples")
+
+    # Create output directories
+    audio_dir = output_dir / category / "audio"
+    dataset_dir = output_dir / category
+    os.makedirs(audio_dir, exist_ok=True)
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    manifest_entries = []
+    successful = 0
+    failed = 0
+
+    for idx, sample in enumerate(tqdm(filtered_samples, desc=f"Processing {category}")):
+        try:
+            audio_dict = sample.get("audio")
+            if audio_dict is None:
+                failed += 1
+                continue
+
+            if isinstance(audio_dict, dict):
+                audio_array = audio_dict.get("array")
+                sampling_rate = audio_dict.get("sampling_rate", 16000)
+            else:
+                failed += 1
+                continue
+
+            if audio_array is None or len(audio_array) == 0:
+                failed += 1
+                continue
+
+            if isinstance(audio_array, list):
+                audio_array = np.array(audio_array)
+
+            duration = get_audio_duration(audio_array, sampling_rate)
+            audio_filename = f"musan_{category}_{idx:06d}.wav"
+            local_audio_path = audio_dir / audio_filename
+
+            if save_audio:
+                try:
+                    save_audio_file(audio_array, sampling_rate, str(local_audio_path))
+                except Exception as e:
+                    print(f"Failed to save sample {idx}: {e}")
+                    failed += 1
+                    continue
+
+            label = sample.get("label", -1)
+            entry = create_manifest_entry(
+                audio_filename=audio_filename,
+                duration=duration,
+                category=category,
+                sample_id=idx,
+                label=str(label),
+            )
+
+            manifest_entries.append(entry)
+            successful += 1
+
+        except Exception as e:
+            print(f"Error processing sample {idx}: {e}")
+            failed += 1
+            continue
+
+    manifest_path = dataset_dir / "test.jsonl"
+    with open(manifest_path, "w", encoding="utf-8") as f:
+        for entry in manifest_entries:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+
+    print(f"Saved {successful} samples to {manifest_path}")
+    if failed > 0:
+        print(f"Failed: {failed} samples")
+
+    return successful, manifest_entries
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Prepare MUSAN dataset for nemo-skills")
+    parser.add_argument(
+        "--source",
+        choices=["huggingface", "kaggle", "openslr"],
+        default="huggingface",
+        help="Download source: huggingface (fast, incomplete), kaggle (complete, API key), openslr (complete, 11GB)",
+    )
+    parser.add_argument("--split", default="train", choices=["train", "validation", "test"])
+    parser.add_argument("--output-dir", type=str, default=None)
+    parser.add_argument(
+        "--categories",
+        nargs="+",
+        choices=["music", "speech", "noise"],
+        default=["music", "speech", "noise"],
+    )
+    parser.add_argument("--no-audio", dest="save_audio", action="store_false")
+    parser.add_argument("--max-samples", type=int, default=-1)
+    parser.set_defaults(save_audio=True)
+
+    args = parser.parse_args()
+
+    if args.output_dir:
+        output_dir = Path(args.output_dir)
+    else:
+        output_dir = Path(__file__).parent
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print("\n" + "=" * 60)
+    print(f"MUSAN Dataset Preparation")
+    print("=" * 60)
+    print(f"Source: {args.source}")
+    print(f"Output: {output_dir}")
+    print(f"Categories: {', '.join(args.categories)}")
+    print("=" * 60 + "\n")
+    
+    try:
+        dataset, source_type = load_dataset_from_source(args.source, output_dir)
+    except Exception as e:
+        print(f"Failed to load dataset: {e}")
+        return
+
+    total_samples = 0
+    successful_categories = []
+    failed_categories = []
+    all_entries = []
+
+    for category in args.categories:
+        try:
+            num_samples, entries = process_category(
+                category=category,
+                output_dir=output_dir,
+                dataset=dataset,
+                source_type=source_type,
+                save_audio=args.save_audio,
+                split=args.split,
+                max_samples=args.max_samples,
+            )
+            total_samples += num_samples
+            successful_categories.append(category)
+            all_entries.extend(entries)
+
+        except Exception as e:
+            print(f"\nFailed: {category} - {e}\n")
+            failed_categories.append((category, str(e)))
+
+    if all_entries:
+        combined_manifest_path = output_dir / "test.jsonl"
+        with open(combined_manifest_path, "w", encoding="utf-8") as f:
+            for entry in all_entries:
+                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+        print(f"\nCombined manifest: {combined_manifest_path}")
+        print(f"Total samples: {len(all_entries)}")
+
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}")
+    print(f"Total samples: {total_samples}")
+
+    if successful_categories:
+        for name in successful_categories:
+            print(f"  ✓ {name}")
+
+    if failed_categories:
+        for name, error in failed_categories:
+            print(f"  ✗ {name}: {error}")
+
+    print("=" * 60 + "\n")
+
+
+if __name__ == "__main__":
+    main()
+

From 7b35079b0a6a875c78bda1a80264944d78563e25 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Mon, 22 Dec 2025 03:28:53 -0800
Subject: [PATCH 2/8] Apply pre-commit fixes; convert char rate from chars/sec to chars/min

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/dataset/musan/__init__.py         |  3 +-
 nemo_skills/dataset/musan/prepare.py          | 86 ++++++++++---------
 nemo_skills/evaluation/evaluator/audio.py     | 14 +--
 .../evaluation/metrics/audio_metrics.py       |  4 +-
 4 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/nemo_skills/dataset/musan/__init__.py b/nemo_skills/dataset/musan/__init__.py
index e91ef7c3b8..2962ad75bf 100644
--- a/nemo_skills/dataset/musan/__init__.py
+++ b/nemo_skills/dataset/musan/__init__.py
@@ -14,7 +14,7 @@
 
 """MUSAN: A Music, Speech, and Noise Corpus
 
-MUSAN is a corpus of music, speech, and noise recordings designed for training 
+MUSAN is a corpus of music, speech, and noise recordings designed for training
 models for voice activity detection and music/speech discrimination.
 
 DOWNLOAD OPTIONS:
@@ -55,4 +55,3 @@
 BENCHMARKS = {
     "musan": {},
 }
-
diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
index 1393d1c04b..c5367867c0 100644
--- a/nemo_skills/dataset/musan/prepare.py
+++ b/nemo_skills/dataset/musan/prepare.py
@@ -33,13 +33,12 @@
 import tarfile
 import urllib.request
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 import numpy as np
 import soundfile as sf
 from tqdm import tqdm
 
-
 # HuggingFace dataset label mappings
 CATEGORY_LABELS = {
     "noise": 0,
@@ -58,9 +57,9 @@ def download_from_kaggle(output_dir: Path) -> Path:
         import kagglehub
     except ImportError:
         raise ImportError("kagglehub not installed. Run: pip install kagglehub")
-    
+
     print("Downloading from Kaggle (requires API key in ~/.kaggle/kaggle.json)")
-    
+
     try:
         path = kagglehub.dataset_download("dogrose/musan-dataset")
         print(f"Downloaded to: {path}")
@@ -74,32 +73,33 @@ def download_from_openslr(output_dir: Path) -> Path:
     url = "https://www.openslr.org/resources/17/musan.tar.gz"
     download_path = output_dir / "musan.tar.gz"
     extract_path = output_dir / "musan_openslr"
-    
-    print(f"Downloading from OpenSLR (~11 GB)")
+
+    print("Downloading from OpenSLR (~11 GB)")
     print(f"URL: {url}")
-    
+
     if not download_path.exists():
+
         def reporthook(block_num, block_size, total_size):
             downloaded = block_num * block_size
             percent = min(downloaded / total_size * 100, 100)
             mb_downloaded = downloaded / (1024 * 1024)
             mb_total = total_size / (1024 * 1024)
-            print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end='')
-        
+            print(f"\r{percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="")
+
         urllib.request.urlretrieve(url, download_path, reporthook)
         print("\nDownload complete")
     else:
         print(f"Using cached archive: {download_path}")
-    
+
     if not extract_path.exists():
         print(f"Extracting to {extract_path}...")
         extract_path.mkdir(parents=True, exist_ok=True)
-        with tarfile.open(download_path, 'r:gz') as tar:
+        with tarfile.open(download_path, "r:gz") as tar:
             tar.extractall(extract_path)
         print("Extraction complete")
     else:
         print(f"Using extracted data: {extract_path}")
-    
+
     return extract_path / "musan"
 
 
@@ -107,37 +107,38 @@ def load_dataset_from_source(source: str, output_dir: Path):
     """Load MUSAN dataset from specified source."""
     if source == "huggingface":
         from datasets import load_dataset
+
         print("Loading from HuggingFace...")
         dataset = load_dataset("FluidInference/musan", split="train")
         print(f"Loaded {len(dataset)} samples")
         return dataset, "huggingface"
-        
+
     elif source == "kaggle":
         dataset_path = download_from_kaggle(output_dir)
         musan_path = dataset_path / "musan"
         if not musan_path.exists():
             raise ValueError(f"'musan' directory not found in {dataset_path}")
-        
+
         print(f"Dataset path: {musan_path}")
-        for cat in ['music', 'speech', 'noise']:
+        for cat in ["music", "speech", "noise"]:
             cat_path = musan_path / cat
             if cat_path.exists():
                 wav_count = len(list(cat_path.glob("**/*.wav")))
                 print(f"  {cat}: {wav_count} files")
-        
+
         return musan_path, "kaggle"
-        
+
     elif source == "openslr":
         dataset_path = download_from_openslr(output_dir)
         print(f"Dataset path: {dataset_path}")
-        for cat in ['music', 'speech', 'noise']:
+        for cat in ["music", "speech", "noise"]:
             cat_path = dataset_path / cat
             if cat_path.exists():
                 wav_count = len(list(cat_path.glob("**/*.wav")))
                 print(f"  {cat}: {wav_count} files")
-        
+
         return dataset_path, "openslr"
-        
+
     else:
         raise ValueError(f"Unknown source: {source}")
 
@@ -165,7 +166,7 @@ def create_manifest_entry(
     """Create nemo-skills manifest entry."""
     audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}"
     audio_metadata = {"path": audio_rel_path, "duration": duration}
-    
+
     # Instruction for transcription (expects empty response for non-speech audio)
     instruction = "Transcribe the speech in this audio. If there is no speech, do not output anything."
 
@@ -206,33 +207,33 @@ def process_category_from_files(
     category_path = dataset_path / category
     if not category_path.exists():
         raise ValueError(f"Category directory not found: {category_path}")
-    
+
     wav_files = sorted(list(category_path.glob("**/*.wav")))
     print(f"Found {len(wav_files)} WAV files")
-    
+
     if len(wav_files) == 0:
         return 0, []
-    
+
     if max_samples > 0 and len(wav_files) > max_samples:
         wav_files = wav_files[:max_samples]
         print(f"Limited to {max_samples} samples")
-    
+
     audio_dir = output_dir / category / "audio"
     dataset_dir = output_dir / category
     os.makedirs(audio_dir, exist_ok=True)
     os.makedirs(dataset_dir, exist_ok=True)
-    
+
     manifest_entries = []
     successful = 0
     failed = 0
-    
+
     for idx, wav_path in enumerate(tqdm(wav_files, desc=f"Processing {category}")):
         try:
             audio_array, sampling_rate = sf.read(str(wav_path))
             duration = get_audio_duration(audio_array, sampling_rate)
             audio_filename = f"musan_{category}_{idx:06d}.wav"
             local_audio_path = audio_dir / audio_filename
-            
+
             if save_audio:
                 try:
                     save_audio_file(audio_array, sampling_rate, str(local_audio_path))
@@ -240,7 +241,7 @@ def process_category_from_files(
                     print(f"Failed to save sample {idx}: {e}")
                     failed += 1
                     continue
-            
+
             entry = create_manifest_entry(
                 audio_filename=audio_filename,
                 duration=duration,
@@ -248,24 +249,24 @@ def process_category_from_files(
                 sample_id=idx,
                 label=wav_path.stem,
             )
-            
+
             manifest_entries.append(entry)
             successful += 1
-            
+
         except Exception as e:
             print(f"Error processing {wav_path}: {e}")
             failed += 1
             continue
-    
+
     manifest_path = dataset_dir / "test.jsonl"
     with open(manifest_path, "w", encoding="utf-8") as f:
         for entry in manifest_entries:
             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
-    
+
     print(f"Saved {successful} samples to {manifest_path}")
     if failed > 0:
         print(f"Failed: {failed} samples")
-    
+
     return successful, manifest_entries
 
 
@@ -292,7 +293,7 @@ def process_category(
             split=split,
             max_samples=max_samples,
         )
-    
+
     elif source_type != "huggingface":
         raise NotImplementedError(f"Source '{source_type}' not supported")
 
@@ -301,14 +302,14 @@ def process_category(
     if target_label is None:
         print(f"Unknown category '{category}'")
         return 0, []
-    
+
     for sample in dataset:
         label = sample.get("label")
         if label == target_label:
             filtered_samples.append(sample)
-    
+
     print(f"Found {len(filtered_samples)} samples")
-    
+
     if len(filtered_samples) == 0:
         return 0, []
 
@@ -418,13 +419,13 @@ def main():
     output_dir.mkdir(parents=True, exist_ok=True)
 
     print("\n" + "=" * 60)
-    print(f"MUSAN Dataset Preparation")
+    print("MUSAN Dataset Preparation")
     print("=" * 60)
     print(f"Source: {args.source}")
     print(f"Output: {output_dir}")
     print(f"Categories: {', '.join(args.categories)}")
     print("=" * 60 + "\n")
-    
+
     try:
         dataset, source_type = load_dataset_from_source(args.source, output_dir)
     except Exception as e:
@@ -466,7 +467,9 @@ def main():
     print("\n" + "=" * 60)
     print("SUMMARY")
     print("=" * 60)
-    print(f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}")
+    print(
+        f"Requested: {len(args.categories)}, Successful: {len(successful_categories)}, Failed: {len(failed_categories)}"
+    )
     print(f"Total samples: {total_samples}")
 
     if successful_categories:
@@ -482,4 +485,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
diff --git a/nemo_skills/evaluation/evaluator/audio.py b/nemo_skills/evaluation/evaluator/audio.py
index c212666311..ff97181bbb 100644
--- a/nemo_skills/evaluation/evaluator/audio.py
+++ b/nemo_skills/evaluation/evaluator/audio.py
@@ -224,7 +224,7 @@ def evaluate_cer(reference: str, hypothesis: str) -> dict[str, Any]:
 def evaluate_hallucination(reference: str, hypothesis: str, audio_context: dict = None) -> dict[str, Any]:
     """Detect potential hallucinations via speaking rate anomaly.
 
-    Normal speech: ~10-15 chars/second. Higher rates suggest repetition/hallucination.
+    Normal speech: ~600-900 chars/minute. Higher rates suggest repetition/hallucination.
     Requires audio_duration in audio_context.
     """
     audio_duration = audio_context.get("audio_duration") if audio_context else None
@@ -238,10 +238,11 @@ def evaluate_hallucination(reference: str, hypothesis: str, audio_context: dict
         }
 
     char_count = len(hypothesis)
-    char_rate = char_count / audio_duration
+    # Convert to chars/minute
+    char_rate = (char_count / audio_duration) * 60.0
 
-    # Hallucination threshold: >25 chars/sec (too fast = likely repetition)
-    is_hallucinating = char_rate > 25.0
+    # Hallucination threshold: >1500 chars/min (25 chars/second * 60)
+    is_hallucinating = char_rate > 1500.0
 
     return {
         "hallucination_rate": 1.0 if is_hallucinating else 0.0,
@@ -385,8 +386,9 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
 
     audio_duration = sample.get("audio_duration", None)
     if audio_duration and audio_duration > 0 and expected_answer and generation:
-        updates["ref_char_rate"] = len(expected_answer) / audio_duration
-        updates["hyp_char_rate"] = len(generation) / audio_duration
+        # chars/minute (chars/second * 60)
+        updates["ref_char_rate"] = (len(expected_answer) / audio_duration) * 60.0
+        updates["hyp_char_rate"] = (len(generation) / audio_duration) * 60.0
         updates["char_rate_diff"] = abs(updates["hyp_char_rate"] - updates["ref_char_rate"])
 
     return updates
diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py
index 95a133833d..811d93a88a 100644
--- a/nemo_skills/evaluation/metrics/audio_metrics.py
+++ b/nemo_skills/evaluation/metrics/audio_metrics.py
@@ -72,6 +72,7 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1):
         self.pc_rate_scores = []
         self.punct_f1_scores = []
         self.cap_accuracy_scores = []
+        # Stored as chars/minute (see evaluator/audio.py).
         self.char_rate_scores = []
 
         # Judge scores (AudioBench-style rating 0-5, or legacy binary Yes/No mapped to 1/0)
@@ -337,8 +338,9 @@ def metrics_to_print(self):
             base_metrics["punct_f1"] = as_percentage
         if self.cap_accuracy_scores:
             base_metrics["cap_accuracy"] = as_percentage
+        # char_rate is chars/minute (not a percent).
         if self.char_rate_scores:
-            base_metrics["char_rate"] = as_int
+            base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}"
 
         base_metrics["num_entries"] = as_int  # Add at end for better display order
 

From f8d71a67ff1b2d89ca86cf8e31c2feed7b3cb581 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Mon, 22 Dec 2025 06:42:30 -0800
Subject: [PATCH 3/8] pre commit, upd to mins

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../evaluation/metrics/audio_metrics.py       | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py
index 811d93a88a..95f015542a 100644
--- a/nemo_skills/evaluation/metrics/audio_metrics.py
+++ b/nemo_skills/evaluation/metrics/audio_metrics.py
@@ -72,8 +72,8 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1):
         self.pc_rate_scores = []
         self.punct_f1_scores = []
         self.cap_accuracy_scores = []
-        # Stored as chars/minute (see evaluator/audio.py).
-        self.char_rate_scores = []
+        self.total_hallucinated_chars = 0
+        self.total_audio_seconds = 0.0
 
         # Judge scores (AudioBench-style rating 0-5, or legacy binary Yes/No mapped to 1/0)
         self.judge_ratings = []
@@ -211,8 +211,13 @@ def update(self, predictions):
                 self.punct_f1_scores.append(pred["punct_f1"])
             if "cap_accuracy" in pred and pred["cap_accuracy"] is not None:
                 self.cap_accuracy_scores.append(pred["cap_accuracy"])
-            if "char_rate" in pred and pred["char_rate"] is not None:
-                self.char_rate_scores.append(pred["char_rate"])
+
+            if pred.get("task_type") == "Hallucination":
+                predicted_text = pred.get("predicted_answer") or pred.get("generation") or ""
+                audio_duration = pred.get("audio_duration", 0.0)
+                if audio_duration > 0:
+                    self.total_hallucinated_chars += len(predicted_text.strip())
+                    self.total_audio_seconds += audio_duration
 
             # Collect judge ratings (0-5) from judge datasets if available
             score_dict = self._get_score_dict(pred)
@@ -277,8 +282,9 @@ def get_metrics(self):
                 agg_metrics["cap_accuracy"] = round(
                     100.0 * sum(self.cap_accuracy_scores) / len(self.cap_accuracy_scores), 2
                 )
-            if self.char_rate_scores:
-                agg_metrics["char_rate"] = round(sum(self.char_rate_scores) / len(self.char_rate_scores), 2)
+            if self.total_audio_seconds > 0:
+                total_minutes = self.total_audio_seconds / 60.0
+                agg_metrics["char_rate"] = round(self.total_hallucinated_chars / total_minutes, 2)
 
         return metrics_dict
 
@@ -338,8 +344,7 @@ def metrics_to_print(self):
             base_metrics["punct_f1"] = as_percentage
         if self.cap_accuracy_scores:
             base_metrics["cap_accuracy"] = as_percentage
-        # char_rate is chars/minute (not a percent).
-        if self.char_rate_scores:
+        if self.total_audio_seconds > 0:
             base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}"
 
         base_metrics["num_entries"] = as_int  # Add at end for better display order

From 15245ac71557885b2f93a5324eee734d97dd9fa1 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Mon, 22 Dec 2025 06:54:18 -0800
Subject: [PATCH 4/8] using standard float

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/evaluation/metrics/audio_metrics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py
index 95f015542a..7142f634fe 100644
--- a/nemo_skills/evaluation/metrics/audio_metrics.py
+++ b/nemo_skills/evaluation/metrics/audio_metrics.py
@@ -34,7 +34,7 @@
 
 import logging
 
-from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage
+from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float, as_int, as_percentage
 from nemo_skills.utils import get_logger_name
 
 LOG = logging.getLogger(get_logger_name(__file__))
@@ -345,7 +345,7 @@ def metrics_to_print(self):
         if self.cap_accuracy_scores:
             base_metrics["cap_accuracy"] = as_percentage
         if self.total_audio_seconds > 0:
-            base_metrics["char_rate"] = lambda _k, v, _all: f"{v:.2f}"
+            base_metrics["char_rate"] = as_float
 
         base_metrics["num_entries"] = as_int  # Add at end for better display order
 

From ed9d5e3d4b8ca34d43ad8da88be13d8ed52ea556 Mon Sep 17 00:00:00 2001
From: George <37293288+Jorjeous@users.noreply.github.com>
Date: Thu, 8 Jan 2026 20:17:36 +0400
Subject: [PATCH 5/8] Update nemo_skills/dataset/musan/prepare.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Signed-off-by: George <37293288+Jorjeous@users.noreply.github.com>
---
 nemo_skills/dataset/musan/prepare.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
index c5367867c0..02c6814d57 100644
--- a/nemo_skills/dataset/musan/prepare.py
+++ b/nemo_skills/dataset/musan/prepare.py
@@ -94,7 +94,9 @@ def reporthook(block_num, block_size, total_size):
     if not extract_path.exists():
         print(f"Extracting to {extract_path}...")
         extract_path.mkdir(parents=True, exist_ok=True)
-        with tarfile.open(download_path, "r:gz") as tar:
+        if sys.version_info >= (3, 11, 4):
+            tar.extractall(extract_path, filter="data")
+        else:
             tar.extractall(extract_path)
         print("Extraction complete")
     else:

From 73b80746d138926f2a5567c46d63f6efec36a2c6 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 8 Jan 2026 09:42:13 -0800
Subject: [PATCH 6/8] add to test and exclude

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/pipeline/prepare_data.py | 2 +-
 tests/gpu-tests/test_eval.py         | 1 +
 tests/test_datasets.py               | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/pipeline/prepare_data.py b/nemo_skills/pipeline/prepare_data.py
index 8c3a58a8ba..f4f8328d13 100644
--- a/nemo_skills/pipeline/prepare_data.py
+++ b/nemo_skills/pipeline/prepare_data.py
@@ -31,7 +31,7 @@
 
 
 # TODO: read this from init.py
-DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "librispeech-pc", "audiobench", "asr-leaderboard"]
+DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "mmau-pro", "librispeech-pc", "audiobench", "asr-leaderboard", "musan"]
 
 
 @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py
index 47060a1368..f56a244289 100644
--- a/tests/gpu-tests/test_eval.py
+++ b/tests/gpu-tests/test_eval.py
@@ -46,6 +46,7 @@
     "aalcr",  # Has tokenization mismatch issues
     "audiobench",
     "librispeech-pc",
+    "musan",
 }
 
 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 86fd152df2..f0c77675dc 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -59,6 +59,7 @@
     ("mmau-pro", ["test"]),
     ("audiobench", ["test"]),
     ("librispeech-pc", ["test"]),
+    ("musan", ["test"]),
 ]
 
 

From e51990f47192e9f0f5361dbd97b3c7167a90527c Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 8 Jan 2026 09:51:05 -0800
Subject: [PATCH 7/8] pre commit and revert git proposal

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/dataset/musan/prepare.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
index 02c6814d57..84026ee4c1 100644
--- a/nemo_skills/dataset/musan/prepare.py
+++ b/nemo_skills/dataset/musan/prepare.py
@@ -30,6 +30,7 @@
 import argparse
 import json
 import os
+import sys
 import tarfile
 import urllib.request
 from pathlib import Path
@@ -94,10 +95,11 @@ def reporthook(block_num, block_size, total_size):
     if not extract_path.exists():
         print(f"Extracting to {extract_path}...")
         extract_path.mkdir(parents=True, exist_ok=True)
-        if sys.version_info >= (3, 11, 4):
-            tar.extractall(extract_path, filter="data")
-        else:
-            tar.extractall(extract_path)
+        with tarfile.open(download_path, "r:gz") as tar:
+            if sys.version_info >= (3, 11, 4):
+                tar.extractall(extract_path, filter="data")
+            else:
+                tar.extractall(extract_path)
         print("Extraction complete")
     else:
         print(f"Using extracted data: {extract_path}")

From c1b6cd8a511ec211537fd01963804512ec0ef9e3 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Mon, 12 Jan 2026 04:09:27 -0800
Subject: [PATCH 8/8] update hardcoded to NEMO_SKILLS_AUDIO_ROOT

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/dataset/musan/prepare.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/dataset/musan/prepare.py b/nemo_skills/dataset/musan/prepare.py
index 84026ee4c1..8a735896ba 100644
--- a/nemo_skills/dataset/musan/prepare.py
+++ b/nemo_skills/dataset/musan/prepare.py
@@ -168,7 +168,8 @@ def create_manifest_entry(
     label: str,
 ) -> Dict:
     """Create nemo-skills manifest entry."""
-    audio_rel_path = f"/data/musan/{category}/audio/{audio_filename}"
+    audio_root = os.getenv("NEMO_SKILLS_AUDIO_ROOT", "/data")
+    audio_rel_path = f"{audio_root}/musan/{category}/audio/{audio_filename}"
     audio_metadata = {"path": audio_rel_path, "duration": duration}
 
     # Instruction for transcription (expects empty response for non-speech audio)