Changes from all commits (21 commits)
- ec7a4f6: adding an input template for formatting input (wasiahmad, Oct 3, 2025)
- 83dd6f2: adding an input template for formatting input (wasiahmad, Oct 3, 2025)
- a08fcdd: minor bug fix (wasiahmad, Oct 3, 2025)
- 545f09e: replacing comma with semicolon to avoid hydra issues (wasiahmad, Oct 3, 2025)
- c010c57: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 4, 2025)
- 62164ed: using nemo-skills load config (wasiahmad, Oct 4, 2025)
- 081cd0e: Merge branch 'main' into apply_input_template (wasiahmad, Oct 6, 2025)
- 1260e81: Merge branch 'main' into apply_input_template (wasiahmad, Oct 14, 2025)
- e4d35c5: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 15, 2025)
- ef650c6: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 17, 2025)
- 2353efe: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 17, 2025)
- e0e9009: resolving conflicts (wasiahmad, Oct 24, 2025)
- b6e3f46: Merge branch 'main' into apply_input_template (wasiahmad, Oct 27, 2025)
- 0863b47: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 28, 2025)
- 7b610ff: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 28, 2025)
- 3c9970e: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Oct 29, 2025)
- ef68fe7: Merge branch 'main' into apply_input_template (wasiahmad, Dec 8, 2025)
- 0d655ed: Merge remote-tracking branch 'origin/main' into apply_input_template (wasiahmad, Jan 30, 2026)
- e992227: HF ASR Leaderboard Fix (#1140) (melllinia, Jan 15, 2026)
- 8d6766d: Merge branch 'main' into apply_input_template (wasiahmad, Mar 3, 2026)
- 66af4d5: Merge branch 'main' into apply_input_template (wasiahmad, Mar 3, 2026)
7 changes: 4 additions & 3 deletions nemo_skills/dataset/asr-leaderboard/__init__.py
@@ -14,9 +14,10 @@

# Settings that define how evaluation should be done by default (all can be changed from cmdline)

-# Uses the audio evaluator which computes WER with HuggingFace leaderboard preprocessing
+# Uses the audio evaluator which computes WER with Whisper-style text normalization
+# Data samples should have task_type="ASR" for proper WER calculation
Comment on lines +17 to +18
Contributor

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# 1) Locate task_type handling
echo "=== Search 1: task_type handling ==="
rg -n -C3 --type=py '\btask_type\b' 2>/dev/null | head -100

# 2) Locate audio evaluation / WER entry points
echo ""
echo "=== Search 2: Audio eval / WER entry points ==="
rg -n -C3 --type=py 'WER|word.?error|whisper|eval_type\s*=\s*audio|METRICS_TYPE\s*=\s*"audio"' 2>/dev/null | head -100

# 3) Look specifically for ASR guards/assertions
echo ""
echo "=== Search 3: ASR guards/assertions ==="
rg -n -C3 --type=py 'assert .*task_type|if .*task_type.*ASR|task_type\s*==\s*["'"'"']ASR["'"'"']' 2>/dev/null | head -100

Repository: NVIDIA-NeMo/Skills

Length of output: 18378


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Get the complete evaluate_sample function
echo "=== Full evaluate_sample function ==="
rg -A 70 'def evaluate_sample\(sample: dict' nemo_skills/evaluation/evaluator/audio.py

# Check if there's any else/default handling for unmatched task_type
echo ""
echo "=== Check for else/default handling ==="
rg -A 80 'elif task_type == "PC-Rate"' nemo_skills/evaluation/evaluator/audio.py

Repository: NVIDIA-NeMo/Skills

Length of output: 4129


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Find where evaluate_sample is called
echo "=== Callers of evaluate_sample ==="
rg -B5 -A3 'evaluate_sample\(' nemo_skills/evaluation/evaluator/audio.py | head -60

# Check if there's any validation of task_type before calling evaluate_sample
echo ""
echo "=== Check for task_type validation before evaluate_sample call ==="
rg -B10 'evaluate_sample' nemo_skills/evaluation/evaluator/audio.py | grep -A 10 'def '

Repository: NVIDIA-NeMo/Skills

Length of output: 1613


Enforce task_type="ASR" validation in the evaluator, not only in comments.

The evaluate_sample function at nemo_skills/evaluation/evaluator/audio.py:472 uses .get("task_type", "unknown") with a silent default. If task_type is missing or doesn't match expected values (ASR, ASR-PC, ASR_LEADERBOARD, etc.), the code silently falls through to the else clause (lines 528–531), which skips WER computation and returns minimal fields. This contradicts the documented requirement in nemo_skills/dataset/asr-leaderboard/__init__.py:17-18 that data samples should have task_type="ASR" for proper WER calculation.

Use direct access sample["task_type"] instead of .get() and add explicit validation before metric computation to fail fast when task_type is missing or invalid, preventing silent metric loss.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/dataset/asr-leaderboard/__init__.py` around lines 17 - 18, The
evaluator currently uses sample.get("task_type", "unknown") which silently skips
WER computation; update evaluate_sample in
nemo_skills/evaluation/evaluator/audio.py to access sample["task_type"] directly
and add explicit validation (raise a clear exception) for allowed values (e.g.,
"ASR", "ASR-PC", "ASR_LEADERBOARD") before any metric computation so
missing/invalid task_type fails fast instead of falling through to the else
branch that omits WER.
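The fail-fast validation the comment asks for can be sketched in isolation. This is a hedged sketch, not the evaluator's actual code: the helper name `validate_task_type` and the allowed-value set (taken from the comment above) are assumptions.

```python
# Hypothetical fail-fast validation; the allowed set below comes from the
# review comment and may not match the evaluator's real list of task types.
ALLOWED_TASK_TYPES = {"ASR", "ASR-PC", "ASR_LEADERBOARD"}


def validate_task_type(sample: dict) -> str:
    # Direct access raises KeyError when the field is missing entirely,
    # instead of silently defaulting to "unknown".
    task_type = sample["task_type"]
    if task_type not in ALLOWED_TASK_TYPES:
        raise ValueError(
            f"Unsupported task_type {task_type!r}; expected one of {sorted(ALLOWED_TASK_TYPES)}"
        )
    return task_type
```

Calling this at the top of `evaluate_sample` would surface bad data immediately rather than returning minimal fields without WER.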


REQUIRES_DATA_DIR = True
METRICS_TYPE = "audio"
-EVAL_ARGS = "++eval_type=audio ++eval_config.normalization_mode=hf_leaderboard"
-GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
+EVAL_ARGS = "++eval_type=audio"
+GENERATION_ARGS = "++prompt_format=openai"
73 changes: 19 additions & 54 deletions nemo_skills/evaluation/evaluator/audio.py
@@ -17,6 +17,7 @@
import asyncio
import logging
import re
+from functools import lru_cache
from typing import Any

import numpy as np
@@ -168,8 +169,8 @@ def evaluate_asr_pc(
wer_c = jiwer.wer(ref_c, hyp_c)

if normalize_standard_wer:
-        ref_std = preprocess_asr_text(reference, mode=normalization_mode)
-        hyp_std = preprocess_asr_text(hypothesis, mode=normalization_mode)
+        ref_std = preprocess_asr_text(reference)
+        hyp_std = preprocess_asr_text(hypothesis)
Comment on lines +172 to +173
Contributor
⚠️ Potential issue | 🟠 Major

apply_whisper_normalization / normalization_mode are effectively ignored in the new flow.

At Line 166 and Line 167, ASR-PC standard WER always uses Whisper normalization when enabled, regardless of normalization_mode. At Line 399, ASR always calls evaluate_asr(...) without honoring config.apply_whisper_normalization. This silently ignores user-provided settings.

Suggested fix (make config effective and fail on unsupported modes)
 def evaluate_asr_pc(
     reference: str, hypothesis: str, normalize_standard_wer: bool = True, normalization_mode: str = "standard"
 ) -> dict[str, Any]:
@@
-    if normalize_standard_wer:
-        ref_std = preprocess_asr_text(reference)
-        hyp_std = preprocess_asr_text(hypothesis)
+    if normalize_standard_wer:
+        if normalization_mode == "standard":
+            ref_std = preprocess_asr_text(reference)
+            hyp_std = preprocess_asr_text(hypothesis)
+        elif normalization_mode == "none":
+            ref_std = normalize_whitespace(re.sub(r"[^\w\s]", "", reference.lower()))
+            hyp_std = normalize_whitespace(re.sub(r"[^\w\s]", "", hypothesis.lower()))
+        else:
+            raise ValueError(f"Unsupported normalization_mode: {normalization_mode}")
     else:
         ref_std = normalize_whitespace(re.sub(r"[^\w\s]", "", reference.lower()))
         hyp_std = normalize_whitespace(re.sub(r"[^\w\s]", "", hypothesis.lower()))
-def evaluate_asr(reference: str, hypothesis: str) -> dict[str, Any]:
+def evaluate_asr(reference: str, hypothesis: str, apply_whisper_normalization: bool = True) -> dict[str, Any]:
@@
-    ref = preprocess_asr_text(reference)
-    hyp = preprocess_asr_text(hypothesis)
+    if apply_whisper_normalization:
+        ref = preprocess_asr_text(reference)
+        hyp = preprocess_asr_text(hypothesis)
+    else:
+        ref = normalize_whitespace(re.sub(r"[^\w\s]", "", reference.lower()))
+        hyp = normalize_whitespace(re.sub(r"[^\w\s]", "", hypothesis.lower()))
-    elif task_type == "ASR":
-        metrics = evaluate_asr(expected_answer, generation)
+    elif task_type == "ASR":
+        metrics = evaluate_asr(
+            expected_answer,
+            generation,
+            apply_whisper_normalization=config.apply_whisper_normalization,
+        )
         updates.update(metrics)

As per coding guidelines "Avoid cases where user-passed parameters are unused; code should fail if user specifies an unsupported argument or if a required argument is missing. Use dataclass or **kwargs syntax to handle this automatically".

Also applies to: 399-399

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nemo_skills/evaluation/evaluator/audio.py` around lines 166 - 167, The code
currently always applies Whisper normalization by calling
preprocess_asr_text(reference) and preprocess_asr_text(hypothesis) and calls
evaluate_asr(...) without honoring config.apply_whisper_normalization or
config.normalization_mode; change the flow so that before preprocessing (or
before calling evaluate_asr) you check config.apply_whisper_normalization and
config.normalization_mode: if apply_whisper_normalization is True and
normalization_mode == "whisper" call the Whisper-specific normalization routine
(or call preprocess_asr_text with a mode parameter), if
apply_whisper_normalization is False skip Whisper normalization and use the
standard text preprocessing, and if normalization_mode is set to an unsupported
value raise an explicit error; also update the evaluate_asr(...) call site to
pass or respect these config flags rather than ignoring them so user settings
are enforced (refer to preprocess_asr_text, evaluate_asr,
config.apply_whisper_normalization, and config.normalization_mode).
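The mode dispatch the prompt describes can be sketched standalone. This is a hedged stand-in: the repository's `preprocess_asr_text` delegates to `whisper_normalizer`, while `_basic_english_normalize` below only approximates it with lowercase/punctuation/whitespace handling, and the function name `normalize_for_wer` and the two supported modes are illustrative.

```python
import re


def _basic_english_normalize(text: str) -> str:
    # Simplified stand-in for whisper_normalizer.english.EnglishTextNormalizer:
    # lowercase, strip punctuation, collapse whitespace.
    text = re.sub(r"[^\w\s]", "", text.lower())
    return re.sub(r"\s+", " ", text).strip()


def normalize_for_wer(text: str, mode: str = "standard") -> str:
    # Dispatch on the mode and fail fast on anything unsupported, instead of
    # silently applying the same normalization regardless of the setting.
    if mode == "standard":
        return _basic_english_normalize(text)
    if mode == "none":
        return re.sub(r"\s+", " ", text).strip()
    raise ValueError(f"Unsupported normalization_mode: {mode!r}")
```

The point of the `raise` branch is exactly the coding guideline quoted above: a user-supplied mode that the code cannot honor should be an error, not a no-op.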

else:
ref_std = normalize_whitespace(re.sub(r"[^\w\s]", "", reference.lower()))
hyp_std = normalize_whitespace(re.sub(r"[^\w\s]", "", hypothesis.lower()))
@@ -276,57 +277,17 @@ def resolve_asr_normalization_mode(config: AudioEvaluatorConfig) -> str:
return config.normalization_mode if config.apply_whisper_normalization else "none"


-def preprocess_asr_text(text: str, mode: str = "standard") -> str:
-    """Normalize ASR text for WER calculation.
-
-    Args:
-        text: Raw text.
-        mode: Normalization mode:
-            - "standard": Whisper normalization (default) - converts number words to digits
-            - "audiobench": Full AudioBench normalization (whisper + digits to words + more)
-            - "hf_leaderboard": HuggingFace leaderboard style (whisper normalization)
-            - "none": No normalization (whitespace only)
-            - "no_tn_itn": Lowercase + remove punctuation, no number word conversion (for TN/ITN eval)
-    """
-    if mode not in VALID_NORMALIZATION_MODES:
-        raise ValueError(
-            f"Invalid normalization_mode '{mode}'. Available options: {', '.join(VALID_NORMALIZATION_MODES)}"
-        )
-
-    if mode == "none":
-        return re.sub(r"\s+", " ", text).strip()
-
-    if mode == "no_tn_itn":
-        # Lowercase + remove punctuation + whitespace normalization
-        text = text.lower()
-        text = re.sub(r"[^\w\s]", "", text)
-        return re.sub(r"\s+", " ", text).strip()
-
-    # "standard", "audiobench", and "hf_leaderboard" all use whisper normalization
-    from whisper_normalizer.english import EnglishTextNormalizer
-
-    text = text.lower()
-    text = EnglishTextNormalizer()(text)
-
-    if mode == "audiobench":
-        # Additional audiobench-specific normalization
-        import jiwer
-
-        text = _normalize_digits_to_words(text)
-        text = _expand_contractions(text)
-        text = re.sub(r"(\[|\(|\{|\<)[^\(\)\\n\[\]]*(\]|\)|\}|\>)", "", text)
-        jiwer_process = jiwer.Compose(
-            [
-                jiwer.RemoveMultipleSpaces(),
-                jiwer.ExpandCommonEnglishContractions(),
-                jiwer.RemoveKaldiNonWords(),
-                jiwer.RemovePunctuation(),
-            ]
-        )
-        text = jiwer_process(text)
-        text = _remove_non_speech_elements(text)
-
-    return re.sub(r"\s+", " ", text).strip()
+@lru_cache(maxsize=1)
+def _get_english_normalizer():
+    """Lazily initialize and cache the English text normalizer."""
+    from whisper_normalizer.english import EnglishTextNormalizer
+
+    return EnglishTextNormalizer()
+
+
+def preprocess_asr_text(text: str) -> str:
+    """Apply Whisper-style normalization (lowercase, remove brackets, normalize whitespace)."""
+    return _get_english_normalizer()(text)


def evaluate_asr(reference: str, hypothesis: str, normalization_mode: str = "standard") -> dict[str, Any]:
@@ -339,8 +300,12 @@
"""
import jiwer

-    ref = preprocess_asr_text(reference, mode=normalization_mode)
-    hyp = preprocess_asr_text(hypothesis, mode=normalization_mode)
+    ref = preprocess_asr_text(reference)
+    hyp = preprocess_asr_text(hypothesis)

+    # Store normalized texts before empty substitution
+    text = ref
+    pred_text = hyp

if not ref:
ref = "empty"
@@ -352,8 +317,8 @@
return {
"wer": wer_score,
"is_correct": wer_score < 0.5,
-        "text": ref,
-        "pred_text": hyp,
+        "text": text,
+        "pred_text": pred_text,
}
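For context, the WER returned above is the word-level edit distance between reference and hypothesis divided by the reference length; the evaluator computes it via `jiwer.wer`. The sketch below is a self-contained approximation of that metric, not the project's code.

```python
def word_error_rate(reference: str, hypothesis: str) -> float:
    """Word-level Levenshtein distance divided by reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    # Classic dynamic-programming edit distance over word tokens.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        curr = [i]
        for j, h in enumerate(hyp, start=1):
            cost = 0 if r == h else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    # Guard against an empty reference, mirroring the "empty" substitution above.
    return prev[-1] / max(len(ref), 1)
```

With the diff's `is_correct` rule, a hypothesis with one wrong word out of two (WER 0.5) would already fail the `wer_score < 0.5` threshold.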


34 changes: 32 additions & 2 deletions nemo_skills/training/nemo_rl/start_sft.py
@@ -36,6 +36,7 @@
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

+from nemo_skills.prompt.utils import load_config as load_prompt_config
from nemo_skills.utils import setup_make_sequence_length_divisible_by

TokenizerType = PreTrainedTokenizerBase
@@ -88,6 +89,7 @@ def __init__(
output_key: str = "output",
num_proc: int | None = None,
force_reprocess: bool = False,
+        input_template_path: str | None = None,
Collaborator
also wouldn't this fail for typical cases when e.g. you have keys like problem and generation in your file, but the logic for setting defaults checks for input / output or messages? So that part should also be updated

Collaborator Author
@Kipok I didn't understand this. When a user passes input_template_path, the provided input_key fields must match the keys present in the input_template. Why would having keys like problem and generation cause a failure?

Collaborator
I mean the logic in detect_data_format function

):
self.input_key = input_key
self.output_key = output_key
@@ -100,6 +102,13 @@
else:
self.num_proc = num_proc

+        self.input_template = None
+        if input_template_path:
+            input_template_config = load_prompt_config(input_template_path)
+            if "user" not in input_template_config:
+                raise KeyError(f"'user' key is missing in the YAML file: {input_template_path}")
+            self.input_template = input_template_config["user"]
Comment on lines +105 to +110
Contributor
🛠️ Refactor suggestion | 🟠 Major

🧩 Analysis chain

Verify error handling in load_prompt_config.

The template loading logic is sound, but ensure that load_prompt_config properly handles file errors (missing files, permission issues, invalid YAML) to provide clear error messages to users.

Run the following script to examine the error handling in load_prompt_config:

Note: Line 108 has a long error message in the exception. Consider extracting it to a variable for better style compliance.

-            if "user" not in input_template_config:
-                raise KeyError(f"'user' key is missing in the YAML file: {input_template_path}")
+            if "user" not in input_template_config:
+                msg = f"'user' key is missing in the YAML file: {input_template_path}"
+                raise KeyError(msg)

🏁 Script executed:

#!/bin/bash
# Description: Check error handling in load_prompt_config function

# Find and display the load_config function implementation
ast-grep --pattern $'def load_config($$$) {
  $$$
}'

Length of output: 0


🏁 Script executed:

#!/bin/bash
# Locate the implementation of load_prompt_config in the Python codebase
rg -nP --type=py "def load_prompt_config" -C3

Length of output: 0


🏁 Script executed:

#!/bin/bash
# Search for load_prompt_config references across the repository
rg -n "load_prompt_config"

Length of output: 237


🏁 Script executed:

#!/bin/bash
# Display the implementation of load_config in the prompt utils module
rg -n "def load_config" -C10 nemo_skills/prompt

Length of output: 1538


🏁 Script executed:

#!/bin/bash
# Show full implementation of load_config to inspect error handling
sed -n '350,450p' nemo_skills/prompt/utils.py

Length of output: 1814


Add error handling in load_config and extract literal messages to variables

  • In nemo_skills/prompt/utils.py (load_config), wrap the open(config_path) and yaml.safe_load calls in a try/except that catches FileNotFoundError, PermissionError, and yaml.YAMLError, then rethrows with a clear message including config_path.
  • In nemo_skills/training/nemo_rl/start_sft.py (lines 108–109), extract the KeyError message into a msg variable before raising.
🧰 Tools
🪛 Ruff (0.13.3)

108-108: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In nemo_skills/prompt/utils.py (around the load_config function) wrap the file
open and yaml.safe_load calls in a try/except that catches FileNotFoundError,
PermissionError, and yaml.YAMLError and rethrows a new exception (or raise) with
a clear message that includes the config_path; ensure the original exception is
chained. In nemo_skills/training/nemo_rl/start_sft.py around lines 104–109,
extract the KeyError message into a local variable msg (e.g. msg = f"...") and
raise KeyError(msg) instead of inlining the formatted string in the raise
statement.
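The wrapping the bot suggests might look like the following sketch. It assumes PyYAML, and `load_config_safely` is a hypothetical name; the real `load_config` in `nemo_skills/prompt/utils.py` has a different signature and may validate differently.

```python
import yaml


def load_config_safely(config_path: str) -> dict:
    # Wrap file and parse errors so the failing path is always in the message,
    # chaining the original exception (via `from e`) for debugging.
    try:
        with open(config_path) as f:
            return yaml.safe_load(f)
    except (FileNotFoundError, PermissionError) as e:
        raise RuntimeError(f"Cannot read config file: {config_path}") from e
    except yaml.YAMLError as e:
        raise RuntimeError(f"Invalid YAML in config file: {config_path}") from e
```

Chaining with `from e` keeps the original traceback visible while the top-level message tells the user exactly which file to look at.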


# Train split
self.formatted_ds = {
"train": self.load_or_process_split(train_ds_path, "train"),
@@ -130,11 +139,22 @@ def load_or_process_split(self, path: str, split_name: str) -> Dataset:
print(f"[Map] Processing {split_name} dataset from: {path}")
dataset = load_dataset("json", data_files=str(path))["train"]

+        current_input_key = self.input_key
Bug: Dataset Cache Invalidation Overlooks Template and Key

The PromptResponseDataset cache invalidation is incomplete. It only uses dataset file size, but processing also depends on the input_template_path and input_key. This can cause stale processed data to be loaded if these parameters change.


Collaborator
@wasiahmad this is an important thing to fix if you want to merge this

+        if self.input_template:
+            assert "messages" not in dataset.column_names
+            dataset = dataset.map(
+                self.apply_input_template,
+                batched=True,
+                num_proc=self.num_proc,
+            )
+            current_input_key = "formatted_input"

        if "messages" not in dataset.column_names:
            dataset = dataset.map(
                self.add_messages_key,
                batched=True,
                num_proc=self.num_proc,
+                fn_kwargs={"input_key": current_input_key},
            )

# Save dataset + new size signature
@@ -146,17 +166,26 @@ def load_or_process_split(self, path: str, split_name: str) -> Dataset:
print(f"[Cache] Saved {split_name} dataset to: {cache_dir}")
return dataset

-    def add_messages_key(self, examples: dict[str, list[Any]]) -> dict[str, list[list[dict[str, Any]]]]:
+    def add_messages_key(
+        self, examples: dict[str, list[Any]], input_key: str
+    ) -> dict[str, list[list[dict[str, Any]]]]:
        return {
            "messages": [
                [
                    {"role": "user", "content": input_},
                    {"role": "assistant", "content": output},
                ]
-                for input_, output in zip(examples[self.input_key], examples[self.output_key])
+                for input_, output in zip(examples[input_key], examples[self.output_key])
Bug: Dataset Fails When Template Path Missing

If input_key specifies multiple fields using semicolons but input_template_path is not provided, PromptResponseDataset attempts to use the entire semicolon-separated string as a single column name in add_messages_key. This results in a KeyError as no such column exists in the dataset.


            ]
        }

+    def apply_input_template(self, examples: dict[str, list[Any]]) -> dict[str, list[str]]:
+        keys = [k.strip() for k in self.input_key.split(";")]
+        examples["formatted_input"] = [
+            self.input_template.format(**{k: examples[k][i] for k in keys}) for i in range(len(examples[keys[0]]))
Comment on lines +184 to +185
Contributor
accessing examples[k][i] will fail if any key k doesn't exist in the dataset

Suggested change
-        examples["formatted_input"] = [
-            self.input_template.format(**{k: examples[k][i] for k in keys}) for i in range(len(examples[keys[0]]))
+        formatted_inputs = []
+        for i in range(len(examples[keys[0]])):
+            format_dict = {k: examples[k][i] for k in keys}
+            formatted_inputs.append(self.input_template.format(**format_dict))
+        examples["formatted_input"] = formatted_inputs
+        ]
+        return examples
Bug: Template Application Fails on Invalid Keys

The apply_input_template method can raise KeyError or IndexError. This occurs because it doesn't validate the keys derived from input_key, leading to issues if input_key contains empty segments after splitting and stripping, or if a derived key is missing from the dataset's examples.

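Putting the two findings together, a defensive variant of the template application could look like this standalone sketch (the function is pulled out of the class for illustration, and the exact validation messages are assumptions):

```python
def apply_input_template(examples: dict, input_key: str, input_template: str) -> dict:
    # Reject empty segments such as "a;;b" or " ; " up front.
    keys = [k.strip() for k in input_key.split(";") if k.strip()]
    if not keys:
        raise ValueError(f"input_key produced no usable keys: {input_key!r}")
    # Fail fast with the full list of missing columns instead of a bare KeyError.
    missing = [k for k in keys if k not in examples]
    if missing:
        raise KeyError(f"Keys {missing} not found in dataset columns {sorted(examples)}")
    examples["formatted_input"] = [
        input_template.format(**{k: examples[k][i] for k in keys})
        for i in range(len(examples[keys[0]]))
    ]
    return examples
```

The semicolon-separated `input_key` convention here mirrors commit 545f09e, which replaced commas with semicolons to avoid hydra parsing issues.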



def parse_args():
"""Parse command line arguments."""
@@ -235,6 +264,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
data_config["input_key"],
data_config["output_key"],
force_reprocess=data_config.get("force_reprocess", False),
+        input_template_path=data_config.get("input_template_path", None),
)
print(f" ✓ Training dataset loaded with {len(data.formatted_ds['train'])} samples.")
if data.formatted_ds["validation"] is not None: