From dc025e3d559e8796ae130ee6c4ab82d63a5a9a60 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 31 Jan 2026 14:23:06 -0800 Subject: [PATCH 1/3] Add emergent_tts dataset + eval scripts Introduce the emergent_tts dataset package with prepare/generate/score helpers and default configs to run EmergentTTS evaluation via NeMo-Skills. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/__init__.py | 6 + .../dataset/emergent_tts/emergent/__init__.py | 3 + nemo_skills/dataset/emergent_tts/prepare.py | 238 +++++++++++++++++ .../dataset/emergent_tts/scripts/__init__.py | 2 + .../emergent_tts/scripts/check_deps.py | 95 +++++++ .../emergent_tts/scripts/config/default.yaml | 61 +++++ .../scripts/config/interactive_10.yaml | 22 ++ .../scripts/config/local_interactive_10.yaml | 30 +++ .../config/local_interactive_10_base.yaml | 26 ++ .../scripts/convert_ns_outputs_to_emergent.py | 92 +++++++ .../emergent_tts/scripts/run_tts_eval.py | 168 ++++++++++++ .../dataset/emergent_tts/scripts/score.py | 252 ++++++++++++++++++ 12 files changed, 995 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/emergent/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/prepare.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/check_deps.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/default.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py create mode 100644 
nemo_skills/dataset/emergent_tts/scripts/score.py diff --git a/nemo_skills/dataset/emergent_tts/__init__.py b/nemo_skills/dataset/emergent_tts/__init__.py new file mode 100644 index 0000000000..c95f451485 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/__init__.py @@ -0,0 +1,6 @@ +"""EmergentTTS-Eval dataset integration for NeMo-Skills. + +This package contains tooling to prepare the EmergentTTS-Eval benchmark for +NeMo-Skills evaluation runs. +""" + diff --git a/nemo_skills/dataset/emergent_tts/emergent/__init__.py b/nemo_skills/dataset/emergent_tts/emergent/__init__.py new file mode 100644 index 0000000000..13edac0edf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/emergent/__init__.py @@ -0,0 +1,3 @@ +# EmergentTTS-Eval benchmark (NeMo-Skills) + +GENERATION_ARGS = "++prompt_format=openai" diff --git a/nemo_skills/dataset/emergent_tts/prepare.py b/nemo_skills/dataset/emergent_tts/prepare.py new file mode 100644 index 0000000000..f3616cc2d8 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/prepare.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prepare EmergentTTS-Eval benchmark for NeMo-Skills. 
+ +This script: +1) Downloads the EmergentTTS-Eval HF dataset +2) Saves baseline audios to wav files +3) Writes `data/emergent_tts_eval_data.jsonl` in Emergent's expected schema +4) Downloads `data/wv_mos.ckpt` +5) Writes NeMo-Skills `test.jsonl` for generation (OpenAI prompt format) + +Typical usage (to create everything under your shared NeMo-Skills data dir): + python prepare.py --output_dir /lustre/.../data_dir/emergent_tts +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +import urllib.request +from urllib.error import ContentTooShortError +from pathlib import Path + + +SYSTEM_MESSAGE = "You are a helpful assistant." +DEFAULT_DATASET = "bosonai/EmergentTTS-Eval" +DEFAULT_SPLIT = "train" +WV_MOS_URL = "https://zenodo.org/record/6201162/files/wav2vec2.ckpt?download=1" + + +def _require_deps(): + try: + import numpy as np # noqa: F401 + from datasets import load_dataset # noqa: F401 + import librosa # noqa: F401 + import soundfile # noqa: F401 + from pydub import AudioSegment # noqa: F401 + from tqdm import tqdm # noqa: F401 + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependencies for EmergentTTS-Eval preparation.\n\n" + "Install into the repo venv:\n" + " cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval\n" + " . ./.venv/bin/activate\n" + " pip install datasets numpy pydub tqdm librosa soundfile\n" + ) from e + + +def _download_wv_mos(dst_path: Path, overwrite: bool) -> None: + dst_path.parent.mkdir(parents=True, exist_ok=True) + if dst_path.exists() and not overwrite: + return + tmp_path = dst_path.with_suffix(dst_path.suffix + ".tmp") + + # Zenodo downloads can occasionally fail with ContentTooShortError; retry. 
+ max_attempts = 5 + for attempt in range(1, max_attempts + 1): + if tmp_path.exists(): + tmp_path.unlink() + try: + urllib.request.urlretrieve(WV_MOS_URL, str(tmp_path)) + tmp_path.replace(dst_path) + return + except ContentTooShortError as e: + # Partial download: wait and retry. + wait_s = min(5 * attempt, 30) + print(f"Warning: partial download for wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + except Exception as e: + wait_s = min(5 * attempt, 30) + print(f"Warning: failed downloading wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + + raise RuntimeError(f"Failed to download wv_mos.ckpt after {max_attempts} attempts: {WV_MOS_URL}") + + +def _write_benchmark_init(bench_dir: Path) -> None: + bench_dir.mkdir(parents=True, exist_ok=True) + init_path = bench_dir / "__init__.py" + init_path.write_text( + ( + "# EmergentTTS-Eval benchmark (NeMo-Skills)\n\n" + 'GENERATION_ARGS = "++prompt_format=openai"\n' + ), + encoding="utf-8", + ) + + +def _to_nemo_skills_entry(sample: dict) -> dict: + # MagpieTTS backend expects JSON with at least `text`. We also keep Emergent + # metadata to enable deterministic conversion/scoring later. + payload = { + "text": sample["text_to_synthesize"], + "text_to_synthesize": sample["text_to_synthesize"], + "category": sample["category"], + "evolution_depth": sample["evolution_depth"], + "language": sample["language"], + "unique_id_eval": sample["unique_id_eval"], + # Optional fields used by MagpieTTS evaluation code paths. 
+ "context_audio_filepath": "", + "duration": 5.0, + "context_audio_duration": 5.0, + } + return { + "problem": "", + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": json.dumps(payload, ensure_ascii=False)}, + ], + } + + +def main(): + _require_deps() + import numpy as np + from datasets import load_dataset + from pydub import AudioSegment + from tqdm import tqdm + + parser = argparse.ArgumentParser(description="Prepare EmergentTTS-Eval for NeMo-Skills") + parser.add_argument( + "--output_dir", + type=str, + default=str(Path(__file__).parent), + help="Where to create emergent_tts module structure (default: folder containing this script).", + ) + parser.add_argument("--dataset", type=str, default=DEFAULT_DATASET, help="HF dataset name") + parser.add_argument("--split", type=str, default=DEFAULT_SPLIT, help="HF split to download (train contains 1645)") + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite existing files (baseline audios, jsonl, wv_mos.ckpt, test.jsonl).", + ) + parser.add_argument( + "--num_samples", + type=int, + default=None, + help="Optional: limit number of samples (debug). 
If set, takes the first N rows.", + ) + args = parser.parse_args() + + output_dir = Path(args.output_dir).resolve() + data_dir = output_dir / "data" + baseline_audios_dir = data_dir / "baseline_audios" + baseline_audios_dir.mkdir(parents=True, exist_ok=True) + + # Emergent expected files + emergent_jsonl_path = data_dir / "emergent_tts_eval_data.jsonl" + wv_mos_path = data_dir / "wv_mos.ckpt" + + # NeMo-Skills benchmark module structure + bench_dir = output_dir / "emergent" + test_jsonl_path = bench_dir / "test.jsonl" + _write_benchmark_init(bench_dir) + + # Download dataset + dataset_hf = load_dataset(args.dataset, split=args.split) + total = len(dataset_hf) if args.num_samples is None else min(args.num_samples, len(dataset_hf)) + + if emergent_jsonl_path.exists() and test_jsonl_path.exists() and not args.overwrite: + print(f"Found existing outputs under {output_dir}. Use --overwrite to rebuild.") + else: + if args.overwrite: + for p in [emergent_jsonl_path, test_jsonl_path]: + if p.exists(): + p.unlink() + + emergent_records: list[dict] = [] + + # Build emergent jsonl + baseline audios + for i in tqdm(range(total), desc="Preparing EmergentTTS-Eval"): + curr = dataset_hf[i] + unique_id = i + + # Save baseline audio + wav_path = baseline_audios_dir / f"{unique_id}.wav" + if args.overwrite or not wav_path.exists(): + audio_array = curr["audio"]["array"] + audio_sr = int(curr["audio"]["sampling_rate"]) + audio_array_int16 = np.int16(audio_array * 32767) + audio_segment = AudioSegment( + audio_array_int16.tobytes(), + frame_rate=audio_sr, + sample_width=2, + channels=1, + ) + audio_segment.export(str(wav_path), format="wav") + + emergent_records.append( + { + "unique_id_eval": unique_id, + "category": curr["category"], + "text_to_synthesize": curr["text_to_synthesize"], + "evolution_depth": curr["evolution_depth"], + "language": curr["language"], + } + ) + + # Write emergent jsonl data file + emergent_jsonl_path.parent.mkdir(parents=True, exist_ok=True) + with 
open(emergent_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + + # Write NeMo-Skills test.jsonl + with open(test_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(_to_nemo_skills_entry(rec), ensure_ascii=False) + "\n") + + # Download MOS model checkpoint (used by Emergent scoring) + _download_wv_mos(wv_mos_path, overwrite=args.overwrite) + + print("\nPrepared EmergentTTS-Eval:") + print(f" - data dir: {data_dir}") + print(f" - baseline audios: {baseline_audios_dir}") + print(f" - emergent jsonl: {emergent_jsonl_path}") + print(f" - wv_mos.ckpt: {wv_mos_path}") + print(f" - nemo-skills test.jsonl: {test_jsonl_path}") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/__init__.py b/nemo_skills/dataset/emergent_tts/scripts/__init__.py new file mode 100644 index 0000000000..b1989f6c3b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/__init__.py @@ -0,0 +1,2 @@ +"""Scripts for running EmergentTTS-Eval via NeMo-Skills.""" + diff --git a/nemo_skills/dataset/emergent_tts/scripts/check_deps.py b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py new file mode 100644 index 0000000000..459cfc3311 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +"""Dependency checker for EmergentTTS-Eval integration. 
+ +This script is meant to fail fast with a clear actionable message when you are +missing Python packages needed for: +- dataset preparation (`prepare.py`) +- scoring (EmergentTTS-Eval-public `inference.py`) +""" + +from __future__ import annotations + +import argparse +import importlib +import os +from pathlib import Path + + +def _try_import(module: str) -> str | None: + try: + importlib.import_module(module) + return None + except Exception as e: + return f"{module} ({type(e).__name__}: {e})" + + +def _venv_install_hint(*, emergent_repo_path: str | None) -> str: + repo_root = Path(__file__).resolve().parents[4] # .../nemo_skills/dataset/emergent_tts/scripts + lines = [ + "To install missing deps into the repo venv:", + f" cd {repo_root}", + " . ./.venv/bin/activate", + " pip install -e .", + " pip install librosa soundfile", + ] + if emergent_repo_path: + lines.append(f" pip install -r {Path(emergent_repo_path).resolve()}/requirements.txt") + else: + lines.append(" pip install -r /path/to/EmergentTTS-Eval-public/requirements.txt") + return "\n".join(lines) + + +def main(): + p = argparse.ArgumentParser(description="Check dependencies for EmergentTTS-Eval integration") + p.add_argument("--stage", choices=["prepare", "scoring", "all"], default="all") + p.add_argument( + "--emergent_repo_path", + default=os.environ.get("EMERGENT_TTS_EVAL_REPO", ""), + help="Path to EmergentTTS-Eval-public (used only to print install hint)", + ) + args = p.parse_args() + + emergent_repo_path = args.emergent_repo_path or None + + missing: list[str] = [] + + if args.stage in ("prepare", "all"): + for mod in ["datasets", "numpy", "pydub", "tqdm", "librosa", "soundfile"]: + err = _try_import(mod) + if err: + missing.append(err) + + if args.stage in ("scoring", "all"): + # Minimal set required by EmergentTTS-Eval-public scoring path (fetch-audios mode) + for mod in [ + "torch", + "transformers", + "editdistance", + "whisper_normalizer", + "json_repair", + "tenacity", + "openai", + 
"google.genai", + "pydub", + "librosa", + "soundfile", + ]: + err = _try_import(mod) + if err: + missing.append(err) + + if missing: + print("Missing required dependencies:\n") + for m in missing: + print(f"- {m}") + print() + print(_venv_install_hint(emergent_repo_path=emergent_repo_path)) + raise SystemExit(2) + + print("All required dependencies are available.") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml new file mode 100644 index 0000000000..5015e4151c --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -0,0 +1,61 @@ +# EmergentTTS-Eval pipeline configuration (example) +# +# NOTE: Before running generation, create the dataset under generation.data_dir: +# python nemo_skills/dataset/emergent_tts/prepare.py --output_dir /emergent_tts +# +# Then run: +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage all + +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre + +# Where NeMo-Skills will write eval-results/ and eval-logs/ +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_eval_full_8chunks + +# NeMo source checkout on EOS (needed for MagpieTTS inference modules). +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + # One GPU for the server process. + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 16 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Shared NeMo-Skills data_dir. 
Must contain emergent_tts/emergent/test.jsonl and emergent_tts/emergent/__init__.py + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + + # Full run: split across 8 chunks. + num_chunks: 8 + # Request 8 GPUs per node for the generation job allocation. + # (Generation itself calls the server; this matches nv_tts scheduling expectations.) + gpus_per_node: 8 + extra_args: ++server.server_type=vllm_multimodal + +scoring: + gpus: 1 + # Container for scoring jobs (conversion + Emergent eval). Use the same container + # as the generation "main" job (not the Magpie server container). + container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh + # Install missing Python deps at job start (runs inside the scoring container). + # Keep this conservative: avoid upgrading core deps inside the base container. + installation_command: pip install editdistance whisper-normalizer json-repair tenacity + + # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) + scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public + + # Path to Emergent data directory created by prepare.py: + # <data_dir>/emergent_tts/data + emergent_data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir/emergent_tts/data + + # Judge configuration (OpenAI-compatible via NVIDIA Inference API) + judge_model: gcp/google/gemini-2.5-pro + judger_base_url: https://inference-api.nvidia.com/v1/chat/completions + num_threads: 8 + evaluate_function: win_rate + strong_prompting: false + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml new file mode 100644 index 0000000000..dfef60facf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml @@ -0,0 +1,22 @@ +# EmergentTTS-Eval: interactive 10-sample generation smoke test + +cluster: eos +container: 
/lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: interactive +mount_paths: /lustre:/lustre + +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_smoke10 +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml new file mode 100644 index 0000000000..e50bca8e2f --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml @@ -0,0 +1,30 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# +# Usage: +# export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/cluster_configs +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage generation + +cluster: local_nemo_25_11 +partition: interactive + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10 + +# Optional: if your local setup needs a NeMo source checkout for MagpieTTS inference, +# set this to an absolute host path and ensure it's mounted in the local cluster config. 
+nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Use the repo dataset folder (contains emergent_tts/emergent/test.jsonl). + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml new file mode 100644 index 0000000000..6816e5b523 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml @@ -0,0 +1,26 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# Using the base NeMo container (nvcr.io/nvidia/nemo:25.11). + +cluster: local_nemo_25_11_base +partition: interactive + +# For local docker runs, set the container to use for the *server* task. +# (The main task uses cluster_config.containers.nemo-skills.) 
+container: nvcr.io/nvidia/nemo:25.11 + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10_base + +nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py new file mode 100644 index 0000000000..66ab564990 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import argparse +import json +import os +import shutil +from pathlib import Path + + +def _extract_user_json(record: dict) -> dict | None: + for msg in record.get("messages", []): + if msg.get("role") != "user": + continue + content = msg.get("content") + if isinstance(content, dict): + return content + if isinstance(content, str): + try: + return json.loads(content) + except json.JSONDecodeError: + return None + return None + + +def _link_or_copy(src: str, dst: str, mode: str): + if mode == "symlink": + if os.path.islink(dst): + if os.readlink(dst) == src: + return + os.unlink(dst) + elif os.path.exists(dst): + os.unlink(dst) + os.symlink(src, dst) + return + + if mode == "copy": + shutil.copyfile(src, dst) + return + + raise ValueError(f"Unknown mode: {mode}") + + +def main(): + p = argparse.ArgumentParser(description="Convert NeMo-Skills TTS outputs into Emergent audio layout") + p.add_argument("--ns_output_jsonl", required=True, help="Path to NeMo-Skills output.jsonl") + p.add_argument("--out_dir", required=True, help="Destination directory for .wav") + p.add_argument("--mode", choices=["symlink", "copy"], default="symlink") + p.add_argument("--overwrite", action="store_true") + args = p.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + converted = 0 + skipped = 0 + missing = 0 + + with open(args.ns_output_jsonl, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + record = json.loads(line) + user_json = _extract_user_json(record) or {} + unique_id = user_json.get("unique_id_eval", record.get("unique_id_eval")) + audio_path = (record.get("audio") or {}).get("path") + + if unique_id is None: + skipped += 1 + continue + if not audio_path or not os.path.exists(audio_path): + missing += 1 + continue + + dst = out_dir / f"{unique_id}.wav" + if dst.exists() and not args.overwrite: + continue + _link_or_copy(audio_path, str(dst), args.mode) 
+ converted += 1 + + print( + f"Converted {converted} files into {out_dir}. " + f"skipped(no unique_id_eval)={skipped}, missing_audio={missing}" + ) + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py new file mode 100644 index 0000000000..19d08f3497 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. + +""" +Emergent TTS Pipeline: Generation -> Scoring (-> Aggregation) + +This mirrors `nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py` but uses +EmergentTTS-Eval scoring logic. +""" + +import argparse +import os + +import yaml + +from nemo_skills.pipeline.eval import eval as ns_eval +from nemo_skills.pipeline.run_cmd import run_cmd as ns_run_cmd + + +class MockContext: + """Mock typer.Context for programmatic calls.""" + + def __init__(self, extra_args=None): + self.args = extra_args or [] + + +def load_config(config_path: str) -> dict: + with open(config_path) as f: + return yaml.safe_load(f) + + +def run_generation(cfg: dict, expname: str): + gen = cfg["generation"] + # Mirror nv_tts behavior: allow injecting a NeMo source checkout into PYTHONPATH + # for the unified server (MagpieTTS inference code lives in NeMo). + server_args = gen["server_args"] + generation_code_path = cfg.get("generation_code_path") or cfg.get("nemo_code_path") + if generation_code_path: + server_args += f" --code_path {generation_code_path}" + + extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] + ctx = MockContext(extra_args) + return ns_eval( + ctx=ctx, + cluster=cfg["cluster"], + output_dir=cfg["output_dir"], + benchmarks=gen["benchmarks"], + model=gen["model"], + server_type=gen["server_type"], + server_gpus=gen["server_gpus"], + # Local executor doesn't require explicit container/mount_paths in the run YAML. 
+ # For slurm clusters these are required and should be present in the config. + server_container=cfg.get("container", ""), + mount_paths=cfg.get("mount_paths", ""), + server_entrypoint=gen["server_entrypoint"], + server_args=server_args, + data_dir=gen["data_dir"], + num_chunks=gen["num_chunks"], + gpus_per_node=gen.get("gpus_per_node", 1), + partition=cfg["partition"], + expname=expname, + auto_summarize_results=False, + ) + + +def main(): + parser = argparse.ArgumentParser(description="Emergent TTS Eval Pipeline") + parser.add_argument("--config", required=True) + parser.add_argument( + "--stage", + choices=["all", "generation", "scoring", "aggregation"], + default="all", + ) + parser.add_argument("--expname", default="emergent_tts_eval") + args = parser.parse_args() + + cfg = load_config(args.config) + scoring = cfg.get("scoring", {}) + output_dir = cfg["output_dir"] + + gen_exp_name = None + + if args.stage in ("all", "generation"): + print("\n" + "=" * 60) + print("Stage 1: GENERATION") + print("=" * 60) + gen_exp = run_generation(cfg, args.expname) + gen_exp_name = args.expname + print(f"Generation submitted: {gen_exp}") + + if args.stage in ("all", "scoring"): + print("\n" + "=" * 60) + print("Stage 2: SCORING (EmergentTTS-Eval)") + print("=" * 60) + + benchmarks = cfg["generation"]["benchmarks"].split(",") + run_after = [gen_exp_name] if args.stage == "all" and gen_exp_name else None + + scoring_code_path = scoring.get("scoring_code_path", "") + emergent_data_dir = scoring.get("emergent_data_dir", "") + install_cmd = scoring.get("installation_command") + scoring_container = scoring.get("container") or "nemo-skills" + + # Required by Emergent's judge clients + judger_api_key = ( + os.environ.get("JUDGER_API_KEY") + or os.environ.get("NVIDIA_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or "" + ) + if not judger_api_key: + print("Warning: JUDGER_API_KEY/NVIDIA_API_KEY/OPENAI_API_KEY not set; win_rate judging may fail.") + + for benchmark in benchmarks: + 
benchmark = benchmark.strip() + short_name = benchmark.split(".")[-1] + score_cmd = ( + f"JUDGER_API_KEY={judger_api_key} " + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + f"python -m nemo_skills.dataset.emergent_tts.scripts.score " + f"--results_dir {output_dir} " + f"--benchmark {benchmark} " + f"--emergent_data_dir {emergent_data_dir} " + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + f"--num_threads {int(scoring.get('num_threads', 8))} " + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + ) + if scoring.get("strong_prompting"): + score_cmd += " --strong_prompting" + + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=scoring_container, + partition=cfg["partition"], + num_gpus=int(scoring.get("gpus", 1)), + mount_paths=cfg["mount_paths"], + command=score_cmd, + installation_command=install_cmd, + run_after=run_after, + expname=f"{args.expname}_score_{short_name}", + log_dir=f"{output_dir}/eval-logs", + ) + + if args.stage == "aggregation": + print("\n" + "=" * 60) + print("Stage 3: AGGREGATION") + print("=" * 60) + agg_cmd = f"python -m nemo_skills.dataset.emergent_tts.scripts.score --results_dir {output_dir} --aggregation_only" + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=0, + mount_paths=cfg["mount_paths"], + command=agg_cmd, + expname=f"{args.expname}_agg", + log_dir=f"{output_dir}/eval-logs", + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py new file mode 100644 index 0000000000..ec5c77c58b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. 
+ +"""Run EmergentTTS-Eval scoring on NeMo-Skills generated audio. + +This script expects NeMo-Skills generation output layout: + /eval-results//output.jsonl + +It will: + 1) Convert NeMo-Skills `output.jsonl` audio paths into Emergent layout + (/emergent-tts-eval_output-audios/.wav) + 2) Run Emergent scoring in fetch-audios mode (no re-generation) + 3) Write `metrics.json` in the benchmark folder for consistency with other evals +""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + + +def _benchmarks_dir(results_dir: str) -> Path: + p = Path(results_dir) / "eval-results" + return p if p.exists() else Path(results_dir) + + +def _normalize_openai_base_url(url: str) -> str: + # Some callers pass the full endpoint; OpenAI client expects base URL. + suffix = "/v1/chat/completions" + if url.endswith(suffix): + return url[: -len("/chat/completions")] + return url + + +class _NoopModelClient: + """A minimal Emergent model_client for scoring-only runs.""" + + def prepare_emergent_tts_sample(self, text_to_synthesize, category, strong_prompting, prompting_object, **kwargs): + if strong_prompting: + user_message = ( + prompting_object.USER_MESSAGE_STRONG_TEMPLATE.replace( + "{{{descriptions}}}", prompting_object.ALL_DESCRIPTIONS[category] + ).replace("{{{text_to_synthesize}}}", text_to_synthesize) + ) + else: + user_message = prompting_object.USER_MESSAGE_DEFAULT_TEMPLATE.replace( + "{{{text_to_synthesize}}}", text_to_synthesize + ) + return prompting_object.SYSTEM_PROMPT_DEFAULT, user_message + + +def _convert(ns_output_jsonl: Path, out_dir: Path, overwrite: bool) -> None: + from nemo_skills.dataset.emergent_tts.scripts.convert_ns_outputs_to_emergent import main as convert_main + + # Reuse converter as a library via argv. 
+ import sys + + argv = sys.argv + try: + sys.argv = [ + argv[0], + "--ns_output_jsonl", + str(ns_output_jsonl), + "--out_dir", + str(out_dir), + "--mode", + "symlink", + ] + (["--overwrite"] if overwrite else []) + convert_main() + finally: + sys.argv = argv + + +def _run_emergent_scoring( + *, + benchmark_dir: Path, + emergent_data_base_path: Path, + fetch_audios_from_path: Path, + baseline_audios_path: Path, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, +): + # Import from EmergentTTS-Eval-public (caller should add it to PYTHONPATH). + import inference as emergent_inference # type: ignore + + # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. + os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) + + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + + +def run_scoring( + *, + results_dir: str, + benchmark: str | None, + emergent_data_dir: str, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, + overwrite_converted: bool, +): + bdir = _benchmarks_dir(results_dir) + emergent_data_dir_p = Path(emergent_data_dir) + emergent_base = emergent_data_dir_p # expects 
emergent_tts_eval_data.jsonl and wv_mos.ckpt here + baseline_audios = emergent_data_dir_p / "baseline_audios" + + if benchmark: + benches = [benchmark] + else: + benches = [p.name for p in bdir.iterdir() if p.is_dir()] + + for bench in sorted(benches): + bench_dir = bdir / bench + output_jsonl = bench_dir / "output.jsonl" + if not output_jsonl.exists(): + print(f"Skipping {bench}: output.jsonl not found") + continue + + # Emergent uses this naming convention for generated audio dir (see inference.py). + converted_audio_dir = bench_dir / "emergent-tts-eval_output-audios" + converted_audio_dir.mkdir(parents=True, exist_ok=True) + _convert(output_jsonl, converted_audio_dir, overwrite=overwrite_converted) + + # Run Emergent scoring (writes emergent-tts-eval_* files into bench_dir) + _run_emergent_scoring( + benchmark_dir=bench_dir, + emergent_data_base_path=emergent_base, + fetch_audios_from_path=converted_audio_dir, + baseline_audios_path=baseline_audios, + judge_model=judge_model, + judger_base_url=judger_base_url, + num_threads=num_threads, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + ) + + # Convert Emergent metrics file into `metrics.json` for NeMo-Skills conventions. + # Emergent prefix matches inference.py defaults when strong_prompting=False and voice_to_use=None. 
+ emergent_metrics_path = bench_dir / "emergent-tts-eval_evaluation-metrics.json" + if emergent_metrics_path.exists(): + with open(emergent_metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + with open(bench_dir / "metrics.json", "w", encoding="utf-8") as f: + json.dump(metrics, f, indent=2) + print(f"[{bench}] Saved: {bench_dir/'metrics.json'}") + else: + print(f"[{bench}] Warning: Emergent metrics file not found at {emergent_metrics_path}") + + +def run_aggregation(results_dir: str): + bdir = _benchmarks_dir(results_dir) + print("\nAggregated Results (EmergentTTS-Eval):") + for benchmark in sorted([p.name for p in bdir.iterdir() if p.is_dir()]): + metrics_path = bdir / benchmark / "metrics.json" + if not metrics_path.exists(): + continue + with open(metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + # Keep this minimal; Emergent metrics are keyed like eval/wer, eval/mos, eval/win_rate, etc. + wer = metrics.get("eval/wer") + mos = metrics.get("eval/mos") + win = metrics.get("eval/win_rate") + print(f" {benchmark}:") + if wer is not None: + print(f" WER: {wer:.4f}") + if mos is not None: + print(f" MOS: {mos:.4f}") + if win is not None: + print(f" Win-rate: {win:.4f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="EmergentTTS-Eval scoring for NeMo-Skills outputs") + parser.add_argument("--results_dir", required=True) + parser.add_argument("--benchmark", default=None, help="Score only this benchmark (e.g. 
emergent_tts.emergent)") + parser.add_argument("--aggregation_only", action="store_true") + + parser.add_argument( + "--emergent_data_dir", + required=False, + default=None, + help="Path containing Emergent files: emergent_tts_eval_data.jsonl, wv_mos.ckpt, baseline_audios/", + ) + parser.add_argument("--judge_model", default="gcp/google/gemini-2.5-pro") + parser.add_argument("--judger_base_url", default="https://inference-api.nvidia.com/v1/chat/completions") + parser.add_argument("--num_threads", type=int, default=8) + parser.add_argument("--depths_to_evaluate", default="0,1,2,3") + parser.add_argument( + "--categories_to_evaluate", + default="Emotions,Paralinguistics,Syntactic Complexity,Foreign Words,Questions,Pronunciation", + ) + parser.add_argument("--evaluate_function", default="win_rate") + parser.add_argument("--strong_prompting", action="store_true") + parser.add_argument("--overwrite_converted", action="store_true") + args = parser.parse_args() + + if args.aggregation_only: + run_aggregation(args.results_dir) + else: + emergent_data_dir = args.emergent_data_dir + if emergent_data_dir is None: + # Try to derive from NEMO_SKILLS_DATA_DIR (common in cluster configs). 
+ emergent_data_dir = os.environ.get("EMERGENT_TTS_DATA_BASE_PATH") or os.environ.get("NEMO_SKILLS_DATA_DIR") + if emergent_data_dir: + emergent_data_dir = str(Path(emergent_data_dir) / "emergent_tts" / "data") + if emergent_data_dir is None: + raise SystemExit("--emergent_data_dir is required (or set EMERGENT_TTS_DATA_BASE_PATH/NEMO_SKILLS_DATA_DIR)") + + run_scoring( + results_dir=args.results_dir, + benchmark=args.benchmark, + emergent_data_dir=emergent_data_dir, + judge_model=args.judge_model, + judger_base_url=args.judger_base_url, + num_threads=args.num_threads, + depths_to_evaluate=args.depths_to_evaluate, + categories_to_evaluate=args.categories_to_evaluate, + evaluate_function=args.evaluate_function, + strong_prompting=args.strong_prompting, + overwrite_converted=args.overwrite_converted, + ) + From f7cc464d7ad960342e28c52dc84ea0ee47f96f60 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Tue, 3 Feb 2026 06:13:48 -0800 Subject: [PATCH 2/3] Fix Emergent scoring deps and paths Install google-genai for EmergentTTS-Eval, run scoring from the dataset base dir so relative paths resolve, and avoid shipping large local caches/data. Document EmergentTTS-Eval usage in nv_tts guide. 
Co-authored-by: Cursor --- .gitignore | 8 ++++ .../emergent_tts/scripts/config/default.yaml | 4 +- .../emergent_tts/scripts/run_tts_eval.py | 27 ++++++----- .../dataset/emergent_tts/scripts/score.py | 45 +++++++++++-------- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index e5adf3c582..731d1af5b0 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,14 @@ build .venv *.lock +# Local caches / secrets (never ship to remote via rsync) +.ssh/ +.hf_cache/ +.nemo_run/ + +# Emergent dataset artifacts (large; stored in shared data_dir instead) +nemo_skills/dataset/emergent_tts/data/ + __pycache__ .ipynb_checkpoints diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml index 5015e4151c..9ffb01781f 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -43,7 +43,9 @@ scoring: container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh # Install missing Python deps at job start (runs inside the scoring container). # Keep this conservative: avoid upgrading core deps inside the base container. - installation_command: pip install editdistance whisper-normalizer json-repair tenacity + # EmergentTTS-Eval imports `from google import genai`, so ensure google-genai exists + # but install it without pulling/upgrading transitive deps (to avoid httpx/transformers churn). 
+ installation_command: pip install editdistance whisper-normalizer json-repair tenacity && pip install --no-deps google-genai # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py index 19d08f3497..2b12fd87b6 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -10,6 +10,7 @@ import argparse import os +from pathlib import Path import yaml @@ -100,6 +101,7 @@ def main(): emergent_data_dir = scoring.get("emergent_data_dir", "") install_cmd = scoring.get("installation_command") scoring_container = scoring.get("container") or "nemo-skills" + emergent_data_base_dir = str(Path(emergent_data_dir).parent) if emergent_data_dir else "" # Required by Emergent's judge clients judger_api_key = ( @@ -115,16 +117,17 @@ def main(): benchmark = benchmark.strip() short_name = benchmark.split(".")[-1] score_cmd = ( - f"JUDGER_API_KEY={judger_api_key} " - f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " - f"python -m nemo_skills.dataset.emergent_tts.scripts.score " - f"--results_dir {output_dir} " - f"--benchmark {benchmark} " - f"--emergent_data_dir {emergent_data_dir} " - f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " - f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " - f"--num_threads {int(scoring.get('num_threads', 8))} " - f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + (f"cd {emergent_data_base_dir} && " if emergent_data_base_dir else "") + + f"JUDGER_API_KEY={judger_api_key} " + + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + + "python -m nemo_skills.dataset.emergent_tts.scripts.score " + + f"--results_dir {output_dir} " + + f"--benchmark 
{benchmark} " + + f"--emergent_data_dir {emergent_data_dir} " + + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + + f"--num_threads {int(scoring.get('num_threads', 8))} " + + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" ) if scoring.get("strong_prompting"): score_cmd += " --strong_prompting" @@ -139,6 +142,9 @@ def main(): command=score_cmd, installation_command=install_cmd, run_after=run_after, + # Ensure we ship the current repo state for scoring jobs. + # (Otherwise nemo_run may reuse an older code snapshot and miss fixes.) + reuse_code=False, expname=f"{args.expname}_score_{short_name}", log_dir=f"{output_dir}/eval-logs", ) @@ -156,6 +162,7 @@ def main(): num_gpus=0, mount_paths=cfg["mount_paths"], command=agg_cmd, + reuse_code=False, expname=f"{args.expname}_agg", log_dir=f"{output_dir}/eval-logs", ) diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py index ec5c77c58b..3ff0a0ca6b 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/score.py +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -93,24 +93,33 @@ def _run_emergent_scoring( # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. 
os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) - emergent_inference.eval_api_closed_model( - model_client=_NoopModelClient(), - accelerator=None, - depths_to_evaluate=depths_to_evaluate, - categories_to_evaluate=categories_to_evaluate, - seed=42, - output_dir=str(benchmark_dir), - num_samples=None, - baseline_audios_path=str(baseline_audios_path), - fetch_audios_from_path=str(fetch_audios_from_path), - judge_model=judge_model, - temperature=0.0, - evaluate_function=evaluate_function, - strong_prompting=strong_prompting, - judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, - num_threads=num_threads, - model_name="nemo-skills-generated", - ) + # EmergentTTS-Eval expects paths like "data/emergent_tts_eval_data.jsonl" relative + # to its *data base directory* (repo root). We keep the dataset in a shared path: + # <...>/emergent_tts/data/{emergent_tts_eval_data.jsonl,wv_mos.ckpt,baseline_audios/} + # So we temporarily `chdir` into the directory that contains the "data/" folder. 
+ prev_cwd = os.getcwd() + try: + os.chdir(str(emergent_data_base_path.parent)) + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + finally: + os.chdir(prev_cwd) def run_scoring( From 351b1bf5ab93265f487d1b5a0d50c472935119e6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 6 Feb 2026 02:23:50 -0800 Subject: [PATCH 3/3] Add emergent_tts README Document dataset preparation (HF_TOKEN) and evaluation workflow, including cloning and patching EmergentTTS-Eval for NVIDIA Inference API judging. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/README.md | 124 +++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/README.md diff --git a/nemo_skills/dataset/emergent_tts/README.md b/nemo_skills/dataset/emergent_tts/README.md new file mode 100644 index 0000000000..140dbb5533 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/README.md @@ -0,0 +1,124 @@ +## EmergentTTS-Eval dataset (`emergent_tts`) + +This dataset integration lets you: + +- **Prepare** the EmergentTTS-Eval test set under a shared `data_dir` (download baseline audios + metadata + MOS model). +- **Generate** TTS outputs with NeMo-Skills (`ns eval` via `run_tts_eval.py`). +- **Score** the generated outputs with EmergentTTS-Eval (WER/MOS/win-rate, depending on config). 
+
+### 1) Prepare the test set (requires `HF_TOKEN`)
+
+`prepare.py` downloads the dataset and writes all required artifacts into:
+
+- `<data_dir>/emergent_tts/emergent/test.jsonl`
+- `<data_dir>/emergent_tts/data/emergent_tts_eval_data.jsonl`
+- `<data_dir>/emergent_tts/data/baseline_audios/*.wav`
+- `<data_dir>/emergent_tts/data/wv_mos.ckpt`
+
+Run it from your dev machine (or any environment with network access):
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval
+. ./.venv/bin/activate
+
+export HF_TOKEN="<your Hugging Face token>"
+
+python nemo_skills/dataset/emergent_tts/prepare.py \
+    --output_dir "<data_dir>/emergent_tts"
+```
+
+Optional flags:
+
+- `--num_samples 10`: write only the first 10 samples (smoke test).
+- `--overwrite`: re-download / regenerate outputs.
+
+### 2) Configure evaluation
+
+Use the example configs in `nemo_skills/dataset/emergent_tts/scripts/config/`.
+
+In `scripts/config/default.yaml`, set:
+
+- `generation.data_dir: <data_dir>`
+- `scoring.emergent_data_dir: <data_dir>/emergent_tts/data`
+- `scoring.scoring_code_path: <code_dir>/EmergentTTS-Eval-public` (on the cluster)
+
+### 3) Clone + patch EmergentTTS-Eval-public for NVIDIA Inference API judging
+
+On EOS (or wherever you run scoring), clone EmergentTTS-Eval:
+
+```bash
+cd /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code
+git clone <EmergentTTS-Eval-public repo URL> EmergentTTS-Eval-public
+```
+
+Then update Emergent’s judge client selection so that **Gemini models are called via NVIDIA’s OpenAI-compatible Inference API**.
+
+Target behavior:
+
+- **Model name** stays as: `gcp/google/gemini-2.5-pro` (or similar). 
+- **Base URL** is NVIDIA Inference API: `https://inference-api.nvidia.com/v1`
+- **API key** comes from: `JUDGER_API_KEY` (or `NVIDIA_API_KEY`)
+
+Minimal patch checklist inside `EmergentTTS-Eval-public`:
+
+- In `api_clients.py` (or wherever the client is chosen), ensure `gcp/google/*` uses an **OpenAI-compatible** client (not the Google SDK client), e.g.:
+  - `OpenAI(base_url=<judger_base_url>, api_key=os.getenv("JUDGER_API_KEY"))`
+- Thread `judger_base_url` through so calls use `https://inference-api.nvidia.com/v1` (not the full `/v1/chat/completions` endpoint).
+
+After patching, set these in `scripts/config/default.yaml`:
+
+- `scoring.judge_model: gcp/google/gemini-2.5-pro`
+- `scoring.judger_base_url: https://inference-api.nvidia.com/v1/chat/completions`
+
+### 4) Run evaluation (generation + scoring)
+
+From your dev machine, submit jobs to EOS:
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval
+. ./.venv/bin/activate
+mkdir -p .nemo_run
+
+export NEMORUN_HOME="$PWD/.nemo_run"
+export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs
+export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
+
+# Required for win-rate judging (NVIDIA Inference API key)
+export JUDGER_API_KEY="<your NVIDIA Inference API key>"
+
+python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \
+    --stage all \
+    --expname emergent_eval
+```
+
+### 5) Smoke test (10 samples, interactive)
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval
+. 
./.venv/bin/activate
+mkdir -p .nemo_run
+
+export NEMORUN_HOME="$PWD/.nemo_run"
+export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs
+export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
+
+python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml \
+    --stage generation \
+    --expname emergent_smoke10
+```
+
+### Outputs
+
+NeMo-Skills generation writes:
+
+- `<output_dir>/eval-results/emergent_tts.emergent/output.jsonl`
+- `<output_dir>/eval-results/emergent_tts.emergent/audio/*.wav` (or equivalent)
+
+Emergent scoring writes (in the same benchmark folder):
+
+- `emergent-tts-eval_*_evaluation-predictions.jsonl`
+- `emergent-tts-eval_*_evaluation-metrics.json`
+- `metrics.json` (a NeMo-Skills-friendly copy of Emergent metrics)