
Commit 941282d

stevehuang52 and fayejf authored and committed
Fix AST eval (NVIDIA#8112)
* add text metrics to asr eval
* temporary fix for EncDecTransfModelBPE
* fix bleu eval
* fix typo

Signed-off-by: stevehuang52 <[email protected]>
Co-authored-by: fayejf <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
1 parent c79f839 commit 941282d

File tree: 3 files changed (+30, -12 lines)


examples/asr/transcribe_speech.py

+4
@@ -175,6 +175,9 @@ class TranscriptionConfig:
     # Set to False to return text instead of hypotheses from the transcribe function, so as to save memory
     return_hypotheses: bool = True
 
+    # key for groundtruth text in manifest
+    gt_text_attr_name: str = "text"
+
 
 @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
 def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis]]:
@@ -370,6 +373,7 @@ def autocast(dtype=None):
     if cfg.calculate_wer:
         output_manifest_w_wer, total_res, _ = cal_write_wer(
             pred_manifest=output_filename,
+            gt_text_attr_name=cfg.gt_text_attr_name,
             pred_text_attr_name=pred_text_attr_name,
             clean_groundtruth_text=cfg.clean_groundtruth_text,
             langid=cfg.langid,
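
The new gt_text_attr_name option lets the WER/CER step read the reference from a manifest key other than the default "text", which is what AST manifests need when the ground-truth translation is stored under a different field. Below is a minimal sketch of the underlying call that transcribe_speech.py reaches when calculate_wer is set; the file name "preds_manifest.json" and the "answer" key are invented for illustration, not taken from the commit.

import json

from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer

# Manifest rows whose ground truth lives under a non-default key ("answer" is hypothetical).
rows = [
    {"pred_text": "hello world", "answer": "hello world"},
    {"pred_text": "good morning", "answer": "good mourning"},
]
with open("preds_manifest.json", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

# Point the scorer at the custom ground-truth key instead of the default "text".
manifest_with_wer, total_res, metric = cal_write_wer(
    pred_manifest="preds_manifest.json",
    gt_text_attr_name="answer",
    pred_text_attr_name="pred_text",
)
print(metric, total_res)

From the transcribe_speech.py entry point, the same path is exercised by overriding gt_text_attr_name in TranscriptionConfig while calculate_wer is enabled.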

nemo/collections/asr/parts/utils/eval_utils.py

+24 -12
@@ -106,6 +106,7 @@ def convert_num_to_words(_str: str, langid: str = "en") -> str:
 
 def cal_write_wer(
     pred_manifest: str = None,
+    gt_text_attr_name: str = "text",
     pred_text_attr_name: str = "pred_text",
     clean_groundtruth_text: bool = False,
     langid: str = 'en',
@@ -128,14 +129,17 @@
         for line in fp:
             sample = json.loads(line)
 
-            if 'text' not in sample:
-                logging.info(
-                    "ground-truth text is not present in manifest! Cannot calculate Word Error Rate. Returning!"
-                )
+            if gt_text_attr_name not in sample:
+                if "text" in sample:
+                    gt_text_attr_name = "text"
+                else:
+                    logging.info(
+                        f"ground-truth text attribute {gt_text_attr_name} is not present in manifest! Cannot calculate WER. Returning!"
+                    )
                 return None, None, eval_metric
 
-            hyp = sample[pred_text_attr_name]
-            ref = sample['text']
+            hyp = sample[pred_text_attr_name].strip()
+            ref = sample[gt_text_attr_name].strip()
 
             if clean_groundtruth_text:
                 ref = clean_label(ref, langid=langid)
@@ -211,13 +215,16 @@ def cal_write_text_metric(
             sample = json.loads(line)
 
             if gt_text_attr_name not in sample:
-                logging.info(
-                    f"ground-truth text attribute {pred_text_attr_name} is not present in manifest! Cannot calculate {metric}. Returning!"
-                )
+                if "text" in sample:
+                    gt_text_attr_name = "text"
+                else:
+                    logging.info(
+                        f"ground-truth text attribute {gt_text_attr_name} is not present in manifest! Cannot calculate {metric}. Returning!"
+                    )
                 return None, None, metric
 
-            hyp = sample[pred_text_attr_name]
-            ref = sample['text']
+            hyp = sample[pred_text_attr_name].strip()
+            ref = sample[gt_text_attr_name].strip()
 
             if ignore_punctuation:
                 ref = remove_punctuations(ref, punctuations=punctuations)
@@ -227,13 +234,18 @@
                 ref = ref.lower()
                 hyp = hyp.lower()
 
-            score = metric_calculator(hyp, ref).item()
+            if metric == 'bleu':
+                score = metric_calculator([hyp], [[ref]]).item()
+            else:
+                score = metric_calculator(hyp, ref).item()
             sample[metric] = score  # evaluatin metric, could be word error rate of character error rate
 
             samples.append(sample)
             hyps.append(hyp)
             refs.append(ref)
 
+    if metric == 'bleu':
+        refs = [[ref] for ref in refs]
     total_score = metric_calculator(hyps, refs).item()
 
     if not output_filename:
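
The BLEU branch exists because BLEU-style metrics score batched, multi-reference inputs rather than a single hypothesis/reference string pair. A short sketch of the shape difference, assuming the 'bleu' calculator behaves like torchmetrics' SacreBLEUScore (the exact metric class is not shown in this diff, and the sample sentences are invented):

from torchmetrics.text import SacreBLEUScore, WordErrorRate

hyp = "das haus ist gross"
ref = "das haus ist gross"

# WER/CER-style metrics accept one hypothesis string against one reference string.
wer = WordErrorRate()(hyp, ref).item()

# SacreBLEUScore expects a sequence of hypotheses and, for each hypothesis,
# a sequence of candidate references, hence the per-sample call shape [hyp], [[ref]].
bleu = SacreBLEUScore()
per_sample = bleu([hyp], [[ref]]).item()

# The corpus-level call needs the same wrapping, which is why refs is rebuilt
# as a list of single-element lists before the final metric_calculator(hyps, refs).
hyps = ["das haus ist gross", "ein hund lief"]
refs = [["das haus ist gross"], ["ein hund rannte"]]
corpus = bleu(hyps, refs).item()
print(wer, per_sample, corpus)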

tools/asr_evaluator/asr_evaluator.py

+2
@@ -66,6 +66,8 @@ def main(cfg):
     if cfg.analyst.metric_calculator.get("metric", "wer") == "wer":
         output_manifest_w_wer, total_res, eval_metric = cal_write_wer(
             pred_manifest=cfg.engine.output_filename,
+            gt_text_attr_name=cfg.analyst.metric_calculator.get("gt_text_attr_name", "text"),
+            pred_text_attr_name=cfg.analyst.metric_calculator.get("pred_text_attr_name", "pred_text"),
             clean_groundtruth_text=cfg.analyst.metric_calculator.clean_groundtruth_text,
             langid=cfg.analyst.metric_calculator.langid,
             use_cer=cfg.analyst.metric_calculator.use_cer,
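
In the standalone evaluator the two attribute names are read from the analyst.metric_calculator section with .get(), so both keys are optional and fall back to "text" / "pred_text". A minimal, illustrative OmegaConf fragment showing where they would sit; only the fields touched by this commit are included, and the manifest path is a placeholder:

from omegaconf import OmegaConf

# Illustrative config fragment only; a real asr_evaluator config has more fields.
cfg = OmegaConf.create(
    {
        "engine": {"output_filename": "preds_manifest.json"},
        "analyst": {
            "metric_calculator": {
                "metric": "wer",
                "gt_text_attr_name": "text",         # manifest key holding the ground truth
                "pred_text_attr_name": "pred_text",  # manifest key holding the predictions
                "clean_groundtruth_text": False,
                "langid": "en",
                "use_cer": False,
            }
        },
    }
)

# Optional keys are read with .get(), so omitting them falls back to the defaults.
print(cfg.analyst.metric_calculator.get("gt_text_attr_name", "text"))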
