Create per.py #7538

Merged: 153 commits, Oct 7, 2023

Commits (153)
9c461ea
Move model precision copy (#7336)
maanug-nv Sep 7, 2023
59802b1
Fix PEFT checkpoint loading (#7388)
blahBlahhhJ Sep 7, 2023
0d97b6c
Use distributed optimizer support for multiple dtypes (#7359)
timmoon10 Sep 7, 2023
201cccc
minor fix for llama ckpt conversion script (#7387)
blahBlahhhJ Sep 7, 2023
869240d
Fix wrong calling of librosa.get_duration() in notebook (#7376)
RobinDong Sep 8, 2023
230146f
[PATCH] PEFT import mcore (#7393)
blahBlahhhJ Sep 8, 2023
c2c7d41
Create per.py
ssh-meister Sep 27, 2023
c7ba7ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 27, 2023
6723bda
[TTS] Added a callback for logging initial data (#7384)
anteju Sep 8, 2023
0fffeef
Update Core Commit (#7402)
aklife97 Sep 9, 2023
3eb95a4
Use cfg attribute in bert (#7394)
maanug-nv Sep 9, 2023
834a5c7
Add support for bias conversion in Swiglu models (#7386)
titu1994 Sep 9, 2023
c871147
Update save_to and restore_from for dist checkpointing (#7343)
ericharper Sep 9, 2023
3eed031
fix forward for with mcore=false (#7403)
JimmyZhang12 Sep 9, 2023
6093209
Fix logging to remove 's/it' from progress bar in Megatron models and…
athitten Sep 9, 2023
592282f
Set Activation Checkpointing Defaults (#7404)
aklife97 Sep 9, 2023
00246c9
make loss mask default to false (#7407)
ericharper Sep 9, 2023
0dac83d
Add dummy userbuffer config files (#7408)
erhoo82 Sep 9, 2023
bb7cd82
add missing ubconf files (#7412)
aklife97 Sep 11, 2023
b92cc7c
New tutorial on Speech Data Explorer (#7405)
Jorjeous Sep 11, 2023
e157cd0
Update ptl training ckpt conversion script to work with dist ckpt (#7…
ericharper Sep 12, 2023
1f28287
Allow disabling sanity checking when num_sanity_val_steps=0 (#7413)
athitten Sep 12, 2023
41e664e
Add comprehensive error messages (#7261)
PeganovAnton Sep 12, 2023
f6fc39a
check NEMO_PATH (#7418)
karpnv Sep 12, 2023
2147faa
layer selection for ia3 (#7417)
arendu Sep 13, 2023
749164a
Fix missing pip package 'einops' (#7397)
RobinDong Sep 14, 2023
952b2a4
Fix failure of pyaudio in Google Colab (#7396)
RobinDong Sep 15, 2023
14ba7f8
Update README.md: output_path --> output_manifest_filepath (#7442)
popcornell Sep 18, 2023
32dc1d0
Add rope dynamic linear scaling (#7437)
hsiehjackson Sep 18, 2023
48c25be
Fix None dataloader issue in PTL2.0 (#7455)
KunalDhawan Sep 19, 2023
bafbdc6
[ASR] Confidence measure -> method renames (#7434)
GNroy Sep 19, 2023
1660781
Add steps for document of getting dataset 'SF Bilingual Speech' (#7378)
RobinDong Sep 19, 2023
cffe476
RNN-T confidence and alignment bugfix (#7381)
GNroy Sep 19, 2023
f634e0c
Fix resume from checkpoint in exp_manager (#7424) (#7426)
github-actions[bot] Sep 19, 2023
569dabc
Fix checking of cuda/cpu device for inputs of Decoder (#7444)
RobinDong Sep 19, 2023
edb95cd
Fix failure of ljspeech's get_data.py (#7430)
RobinDong Sep 19, 2023
1bd4bd0
[TTS] Fix audio codec type checks (#7373)
rlangman Sep 19, 2023
d22b4d1
[TTS] Add dataset to path of logged artifacts (#7462)
rlangman Sep 20, 2023
bd9e53f
Fix sft dataset truncation (#7464)
hsiehjackson Sep 20, 2023
b386f5b
Automatic Lip Reading Recognition (ALR) - ASR/CV (Visual ASR) (#7330)
burchim Sep 20, 2023
0140e23
HF StarCoder to NeMo conversion script (#7421)
janekl Sep 20, 2023
24d2e50
fix bug when loading dist ckpt in peft (#7452)
lhb8125 Sep 21, 2023
6e83a05
Fix adding positional embeddings in-place in transformer module (#7440)
The0nix Sep 21, 2023
63a08df
Fix (#7478)
hsiehjackson Sep 22, 2023
d53b88b
add sleep (#7498) (#7499)
github-actions[bot] Sep 24, 2023
d36dea1
Fix exp manager check for sleep (#7503) (#7504)
github-actions[bot] Sep 25, 2023
660d8e4
bugfix: trainer.accelerator=auto from None. (#7492) (#7493)
github-actions[bot] Sep 25, 2023
0a556b7
[doc] fix broken link (#7481)
stas00 Sep 25, 2023
952b768
[TTS] Read audio as int32 to avoid flac read errors (#7477)
rlangman Sep 26, 2023
43df79d
Add dataset 'AISHELL-3' from OpenSLR for training mandarin TTS (#7409)
RobinDong Sep 26, 2023
6d2dcca
dllogger - log on rank 0 only (#7513)
stas00 Sep 26, 2023
a948ce6
Fix TTS FastPitch tutorial (#7494) (#7516)
github-actions[bot] Sep 26, 2023
0d86cad
Fix get_dist() tensor dimension (#7506) (#7515)
github-actions[bot] Sep 26, 2023
5195b49
bugfix: specify trainer.strategy=auto when devices=1 (#7509) (#7512)
github-actions[bot] Sep 26, 2023
4960b24
fix (#7511)
aklife97 Sep 26, 2023
5a252c6
[TTS] Fix FastPitch data prep tutorial (#7524)
rlangman Sep 27, 2023
89b00c1
add italian tokenization (#7486)
GiacomoLeoneMaria Sep 27, 2023
52a2486
Replace None strategy with auto in tutorial notebooks (#7521) (#7527)
github-actions[bot] Sep 27, 2023
04d143b
unpin setuptools (#7534) (#7535)
github-actions[bot] Sep 27, 2023
2854d39
Update per.py
ssh-meister Sep 28, 2023
2f0d6b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 28, 2023
6ff3768
Create punctuation_rates.py
ssh-meister Sep 29, 2023
89504b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 29, 2023
85b87c0
Format fixing
ssh-meister Oct 2, 2023
21a1ed7
added nemo.logging, header, docstrings, how to use
ssh-meister Oct 2, 2023
6a05553
Added asserions to rate_punctuation.py
ssh-meister Oct 2, 2023
8d0abc2
fix typo
ssh-meister Oct 2, 2023
a5bc120
added function for import and call, docstrings
ssh-meister Oct 2, 2023
3627bea
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 2, 2023
250b031
remove auto generated examples (#7510)
arendu Sep 27, 2023
05c8bf6
Add the `strategy` argument to `MegatronGPTModel.generate()` (#7264)
odelalleau Sep 27, 2023
c847f0b
Fix PTL2.0 related ASR bugs in r1.21.0: Val metrics logging, None dat…
github-actions[bot] Sep 27, 2023
258a159
gpus -> devices (#7542) (#7545)
github-actions[bot] Sep 28, 2023
061f9e9
Update FFMPEG version to fix issue with torchaudio (#7551) (#7553)
github-actions[bot] Sep 28, 2023
1291e2a
PEFT GPT & T5 Refactor (#7308)
meatybobby Sep 28, 2023
9352c7d
fix a typo (#7496)
BestJuly Sep 28, 2023
0f2802e
[TTS] remove curly braces from ${BRANCH} in jupyer notebook cell. (#7…
github-actions[bot] Sep 28, 2023
cf6f95f
add youtube embed url (#7570)
XuesongYang Sep 29, 2023
9b19b68
Remap speakers to continuous range of speaker_id for dataset AISHELL3…
RobinDong Sep 29, 2023
5a4bff0
fix validation_step_outputs initialization for multi-dataloader (#754…
github-actions[bot] Sep 29, 2023
edd3490
Append output of val step to self.validation_step_outputs (#7530) (#7…
github-actions[bot] Sep 29, 2023
bfbe627
[TTS] fixed trainer's accelerator and strategy. (#7569) (#7574)
github-actions[bot] Sep 29, 2023
1ce2455
Append val/test output to instance variable in EncDecSpeakerLabelMode…
github-actions[bot] Sep 29, 2023
c7f4ecb
Fix CustomProgressBar for resume (#7427) (#7522)
github-actions[bot] Sep 30, 2023
9811729
fix typos in nfa and speech enhancement tutorials (#7580) (#7583)
github-actions[bot] Sep 30, 2023
a6217ea
Add strategy as ddp_find_unused_parameters_true for glue_benchmark.py…
github-actions[bot] Sep 30, 2023
aeac6e8
update strategy (#7577) (#7578)
github-actions[bot] Sep 30, 2023
71bd302
Fix typos (#7581)
Kipok Oct 2, 2023
d062641
Change hifigan finetune strategy to ddp_find_unused_parameters_true (…
github-actions[bot] Oct 2, 2023
0352f30
[BugFix] Add missing quotes for auto strategy in tutorial notebooks (…
github-actions[bot] Oct 2, 2023
c76afc0
added per tests
ssh-meister Oct 2, 2023
3f9b7bb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 2, 2023
8baa297
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 2, 2023
6c74c25
[PATCH] PEFT import mcore (#7393)
blahBlahhhJ Sep 8, 2023
f3d58b1
add build os key (#7596) (#7599)
github-actions[bot] Oct 2, 2023
deb80c4
StarCoder SFT test + bump PyT NGC image to 23.09 (#7540)
janekl Oct 2, 2023
620c011
defaults changed (#7600)
arendu Oct 3, 2023
8c892db
add ItalianPhonemesTokenizer (#7587)
GiacomoLeoneMaria Oct 3, 2023
96ec7ef
best ckpt fix (#7564) (#7588)
github-actions[bot] Oct 3, 2023
518b870
rate_punctuation.py
ssh-meister Oct 3, 2023
c8698d0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 3, 2023
98160b8
Fix tests
ssh-meister Oct 3, 2023
dadf28e
Add files via upload (#7598)
Jorjeous Oct 3, 2023
a4702d9
Fix validation in G2PModel and ThutmoseTaggerModel (#7597) (#7606)
github-actions[bot] Oct 3, 2023
74d9b63
Broadcast loss only when using pipeline parallelism and within the pi…
github-actions[bot] Oct 3, 2023
098c565
Safeguard nemo_text_processing installation on ARM (#7485)
blisc Oct 3, 2023
0bb19db
Function name fixing
ssh-meister Oct 4, 2023
fb13052
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
25f9d92
Moving PER to speech_to_text_eval.py
ssh-meister Oct 4, 2023
2c832fd
Update test_metrics.py
ssh-meister Oct 4, 2023
ce75174
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
8600bc5
Added use_per description
ssh-meister Oct 4, 2023
5a2cae1
guard extra dependencies
ssh-meister Oct 4, 2023
efaa9aa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
db6d1cb
Write metrics to "output_filename" if "scores_per_sample=True"
ssh-meister Oct 4, 2023
873aa84
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
9ffbdd4
scores_per_sample description
ssh-meister Oct 4, 2023
9de6049
Fix import guards
ssh-meister Oct 4, 2023
1c0ceac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
ac73188
Stats printing when HAVE_TABLUATE_AND_PANDAS=False
ssh-meister Oct 4, 2023
1a752e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
33b0142
Bound transformers version in requirements (#7620)
athitten Oct 4, 2023
047d1cc
fix llama2 70b lora tuning bug (#7622)
cuichenx Oct 4, 2023
4552cb5
Fix import error no module name model_utils (#7629)
menon92 Oct 4, 2023
0322a9c
Delete examples/asr/rate_punctuation.py
ssh-meister Oct 4, 2023
3ce067e
Added use_per description
ssh-meister Oct 4, 2023
6bb7d52
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
9f46c92
metric and variables name fixing
ssh-meister Oct 5, 2023
3be7fbf
Add else samples = None
ssh-meister Oct 5, 2023
baef202
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
501d656
add fc large ls models (#7641)
nithinraok Oct 4, 2023
ee17a52
bugfix: trainer.gpus, trainer.strategy, trainer.accelerator (#7621) (…
github-actions[bot] Oct 5, 2023
d958251
fix ssl models ptl monitor val through logging (#7608) (#7614)
github-actions[bot] Oct 5, 2023
6f010c3
Fix metrics for SE tutorial (#7604) (#7612)
github-actions[bot] Oct 5, 2023
dd1b6c7
Add ddp_find_unused_parameters=True and change accelerator to auto (#…
github-actions[bot] Oct 5, 2023
32181a6
Fix py3.11 dataclasses issue (#7616)
github-actions[bot] Oct 5, 2023
d703b1f
moved per sample metrics computing to transcribe_utils
ssh-meister Oct 5, 2023
735d8c3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
36139e2
Moved punctuation rates printing to punct_er
ssh-meister Oct 5, 2023
f8bd001
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
cca7156
Added reset for DatasetPunctuationErrorRate class
ssh-meister Oct 5, 2023
4df6be3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
03589b7
Added compute_metrics_per_sample description
ssh-meister Oct 5, 2023
aa92203
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
69b166a
Merge branch 'main' into per
ssh-meister Oct 5, 2023
bb79ded
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
9a2018c
Update megatron_gpt_peft_models.py
ssh-meister Oct 5, 2023
fe0961b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 5, 2023
e74ef5d
Update speech_to_text_eval.py
ssh-meister Oct 5, 2023
04044ab
Copyright year fixing
ssh-meister Oct 6, 2023
7fb390b
"& AFFILIATES" added
ssh-meister Oct 6, 2023
57f4347
Merge branch 'main' into per
ekmb Oct 6, 2023
fbf3a70
Merge branch 'main' into per
ssh-meister Oct 6, 2023
44 changes: 42 additions & 2 deletions examples/asr/speech_to_text_eval.py
@@ -25,12 +25,18 @@
for full list of arguments >>

dataset_manifest: Required - path to dataset JSON manifest file (in NeMo format)
output_filename: Optional - output filename where the transcriptions will be written.
output_filename: Optional - output filename where the transcriptions will be written. (if scores_per_sample=True,
metrics per sample will be written there too)

use_cer: Bool, whether to compute CER or WER
use_punct_er: Bool, compute dataset Punctuation Error Rate (set the punctuation marks for metrics computation with
"text_processing.punctuation_marks")

tolerance: Float, minimum WER/CER required to pass some arbitrary tolerance.

only_score_manifest: Bool, when set will skip audio transcription and just calculate WER of provided manifest.
scores_per_sample: Bool, compute metrics for each sample separately (if only_score_manifest=True, scores per sample
will be added to the manifest at the dataset_manifest path)

# Usage

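The script's full usage block is collapsed in this diff view; as a minimal sketch, the new options might be combined like this when scoring an existing manifest (paths are placeholders and exact required arguments may differ):

python examples/asr/speech_to_text_eval.py \
    dataset_manifest=/path/to/manifest_with_pred_text.json \
    only_score_manifest=true \
    use_punct_er=true \
    scores_per_sample=true \
    text_processing.punctuation_marks=".,?"
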
@@ -66,7 +72,12 @@
from omegaconf import MISSING, OmegaConf, open_dict

from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig
from nemo.collections.asr.parts.utils.transcribe_utils import (
    PunctuationCapitalization,
    TextProcessingConfig,
    compute_metrics_per_sample,
)
from nemo.collections.common.metrics.punct_er import DatasetPunctuationErrorRate
from nemo.core.config import hydra_runner
from nemo.utils import logging

@@ -82,9 +93,11 @@
    att_context_size: Optional[list] = None

    use_cer: bool = False
    use_punct_er: bool = False
    tolerance: Optional[float] = None

    only_score_manifest: bool = False
    scores_per_sample: bool = False

    text_processing: Optional[TextProcessingConfig] = TextProcessingConfig(
        punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False,
@@ -154,6 +167,29 @@
f"contain value for `pred_text`."
)

    if cfg.use_punct_er:
        dper_obj = DatasetPunctuationErrorRate(
            hypotheses=predicted_text,
            references=ground_truth_text,
            punctuation_marks=list(cfg.text_processing.punctuation_marks),
        )
        dper_obj.compute()

    if cfg.scores_per_sample:
        metrics_to_compute = ["wer", "cer"]

        if cfg.use_punct_er:
            metrics_to_compute.append("punct_er")

        samples_with_metrics = compute_metrics_per_sample(
            manifest_path=cfg.dataset_manifest,
            reference_field="text",
            hypothesis_field="pred_text",
            metrics=metrics_to_compute,
            punctuation_marks=cfg.text_processing.punctuation_marks,
            output_manifest_path=cfg.output_filename,
        )

Check notice (Code scanning / CodeQL): Unused local variable. Variable samples_with_metrics is not used.

    # Compute the WER
    cer = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=True)
    wer = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=False)
@@ -173,6 +209,10 @@

    logging.info(f'Dataset WER/CER ' + str(round(100 * wer, 2)) + "%/" + str(round(100 * cer, 2)) + "%")

    if cfg.use_punct_er:
        dper_obj.print()
        dper_obj.reset()

    # Inject the metric name and score into the config, and return the entire config
    with open_dict(cfg):
        cfg.metric_name = metric_name
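
Before the implementation of the per-sample helper below, a minimal standalone sketch of the DatasetPunctuationErrorRate API driven by the script above; the reference and hypothesis strings are invented toy examples, and only the constructor arguments and methods visible in this diff are used:

from nemo.collections.common.metrics.punct_er import DatasetPunctuationErrorRate

# Toy data, invented for illustration only.
references = ["hello, world.", "how are you?"]
hypotheses = ["hello world.", "how are you."]

dper_obj = DatasetPunctuationErrorRate(
    hypotheses=hypotheses,
    references=references,
    punctuation_marks=[".", ",", "?"],
)
dper_obj.compute()  # accumulate dataset-level punctuation statistics
dper_obj.print()    # log the punctuation rates, as speech_to_text_eval.py does after scoring
dper_obj.reset()    # clear accumulated state, mirroring the script's cleanup
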
92 changes: 92 additions & 0 deletions nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -23,9 +23,11 @@
from tqdm.auto import tqdm

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel
from nemo.collections.asr.parts.utils import rnnt_utils
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
from nemo.collections.common.metrics.punct_er import OccurancePunctuationErrorRate
from nemo.collections.common.parts.preprocessing.manifest import get_full_path
from nemo.utils import logging, model_utils

@@ -472,6 +474,96 @@ def transcribe_partial_audio(
    return hypotheses


def compute_metrics_per_sample(
    manifest_path: str,
    reference_field: str = "text",
    hypothesis_field: str = "pred_text",
    metrics: list[str] = ["wer"],
    punctuation_marks: list[str] = [".", ",", "?"],
    output_manifest_path: str = None,
) -> dict:

    '''
    Computes metrics per sample for given manifest

    Args:
        manifest_path: str, Required - path to dataset JSON manifest file (in NeMo format)
        reference_field: str, Optional - name of field in .json manifest with the reference text ("text" by default).
        hypothesis_field: str, Optional - name of field in .json manifest with the hypothesis text ("pred_text" by default).
        metrics: list[str], Optional - list of metrics to be computed (currently supported "wer", "cer", "punct_er")
        punctuation_marks: list[str], Optional - list of punctuation marks for computing punctuation error rate ([".", ",", "?"] by default).
        output_manifest_path: str, Optional - path where .json manifest with calculated metrics will be saved.

    Returns:
        samples: dict - Dict of samples with calculated metrics
    '''

    supported_metrics = ["wer", "cer", "punct_er"]

    if len(metrics) == 0:
        raise AssertionError(
            f"'metrics' list is empty. \
            Select the metrics from the supported: {supported_metrics}."
        )

    for metric in metrics:
        if metric not in supported_metrics:
            raise AssertionError(
                f"'{metric}' metric is not supported. \
                Currently supported metrics are {supported_metrics}."
            )

    if "punct_er" in metrics:
        if len(punctuation_marks) == 0:
            raise AssertionError("punctuation_marks list can't be empty when 'punct_er' metric is enabled.")
        else:
            oper_obj = OccurancePunctuationErrorRate(punctuation_marks=punctuation_marks)

    use_wer = "wer" in metrics
    use_cer = "cer" in metrics
    use_punct_er = "punct_er" in metrics

    with open(manifest_path, 'r') as manifest:
        lines = manifest.readlines()
        samples = [json.loads(line) for line in lines]
        samples_with_metrics = []

        logging.info(f"Computing {', '.join(metrics)} per sample")

        for sample in tqdm(samples):
            reference = sample[reference_field]
            hypothesis = sample[hypothesis_field]

            if use_wer:
                sample_wer = word_error_rate(hypotheses=[hypothesis], references=[reference], use_cer=False)
                sample["wer"] = round(100 * sample_wer, 2)

            if use_cer:
                sample_cer = word_error_rate(hypotheses=[hypothesis], references=[reference], use_cer=True)
                sample["cer"] = round(100 * sample_cer, 2)

            if use_punct_er:
                operation_amounts, substitution_amounts, punctuation_rates = oper_obj.compute(
                    reference=reference, hypothesis=hypothesis
                )
                sample["punct_correct_rate"] = round(100 * punctuation_rates.correct_rate, 2)
                sample["punct_deletions_rate"] = round(100 * punctuation_rates.deletions_rate, 2)
                sample["punct_insertions_rate"] = round(100 * punctuation_rates.insertions_rate, 2)
                sample["punct_substitutions_rate"] = round(100 * punctuation_rates.substitutions_rate, 2)
                sample["punct_error_rate"] = round(100 * punctuation_rates.punct_er, 2)

            samples_with_metrics.append(sample)

    if output_manifest_path is not None:
        with open(output_manifest_path, 'w') as output:
            for sample in samples_with_metrics:
                line = json.dumps(sample)
                output.writelines(f'{line}\n')
        logging.info(f'Output manifest saved: {output_manifest_path}')

    return samples_with_metrics
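
A minimal usage sketch for the helper above; the manifest and output paths are placeholders, and each manifest line is assumed to already contain both "text" and "pred_text" fields:

from nemo.collections.asr.parts.utils.transcribe_utils import compute_metrics_per_sample

samples = compute_metrics_per_sample(
    manifest_path="/path/to/manifest_with_pred_text.json",  # placeholder path
    reference_field="text",
    hypothesis_field="pred_text",
    metrics=["wer", "cer", "punct_er"],
    punctuation_marks=[".", ",", "?"],
    output_manifest_path="/path/to/scored_manifest.json",  # optional; omit to skip writing a manifest
)

# Each returned sample dict now carries the per-sample fields added above,
# e.g. sample["wer"], sample["cer"], sample["punct_error_rate"].
for sample in samples:
    print(sample["wer"], sample.get("punct_error_rate"))
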


class PunctuationCapitalization:
    def __init__(self, punctuation_marks: str):
        """