.ctm in data simulator annotator compliant with RT-09 specification (N…

…VIDIA#8004) * .ctm fix for data simulation Signed-off-by: popcornell <[email protected]> * .ctm fix, channel should be 1 not 0 Signed-off-by: popcornell <[email protected]> * .ctm fix, only two na, type and confidence Signed-off-by: popcornell <[email protected]> * Revised all the parts in NeMo touching CTM files Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tutorial, nemo-docs and tests for CTM formats Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed the docstrings in create_alignment_manifest.py Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Some missing refactored variables for type_of_token Signed-off-by: Taejin Park <[email protected]> * Another un-fixed part in data_simulation_utils.py Signed-off-by: Taejin Park <[email protected]> * Reflected comments from PR Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reflected another precision related comments from PR Signed-off-by: Taejin Park <[email protected]> * Updated tests to use decimal rounding of 2 Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Changed beg_time to start_time and fixed unit tests Signed-off-by: Taejin Park <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed typos and errors in manifest_utils.py Signed-off-by: Taejin Park <[email protected]> * Resolved another merge conflict Signed-off-by: Taejin Park <[email protected]> * Fixed the test errors Signed-off-by: Taejin Park <[email 
protected]> * Fixed the missed commented lines Signed-off-by: Taejin Park <[email protected]> --------- Signed-off-by: popcornell <[email protected]> Signed-off-by: Taejin Park <[email protected]> Co-authored-by: Taejin Park <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: He Huang (Steve) <[email protected]> Signed-off-by: Sasha Meister <[email protected]>
ssh-meister · Feb 15, 2024 · 4e472b7 · 4e472b7
1 parent e2612c1
commit 4e472b7
Show file tree

Hide file tree

Showing 8 changed files with 376 additions and 41 deletions.
diff --git a/docs/source/asr/speaker_diarization/datasets.rst b/docs/source/asr/speaker_diarization/datasets.rst
@@ -205,14 +205,14 @@ The following are descriptions about each field in an input manifest JSON file.
 
 ``ctm_filepath`` (Optional):
 
-  CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. CTM file follows the following convention: ``<uniq-id> <speaker ID> <word start time> <word end time> <word> <confidence>`` Since confidence is not required for evaluating diarization results, it can have any value. Note that the ``<speaker_id>`` should be exactly matched with speaker IDs in RTTM. 
+  The CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. The CTM file follows this convention: ``<session name> <channel ID> <start time> <duration> <word> <confidence> <type of token> <speaker>``. Note that the ``<speaker>`` should exactly match the speaker IDs in the RTTM file. Since confidence is not required for evaluating diarization results, we assign ``<confidence>`` the value ``NA``. For ordinary word tokens, ``<type of token>`` is set to ``lex``.
 
   Example lines of CTM file:
 
 .. code-block:: bash
   
-   TS3012d.Mix-Headset MTD046ID 12.879 0.32 okay 0
-   TS3012d.Mix-Headset MTD046ID 13.203 0.24 yeah 0
+   TS3012d.Mix-Headset 1 12.879 0.32 okay NA lex MTD046ID
+   TS3012d.Mix-Headset 1 13.203 0.24 yeah NA lex MTD046ID
 
 
 Evaluation on Benchmark Datasets

diff --git a/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py b/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py
@@ -63,8 +63,6 @@ def main(cfg):
 
     # If RTTM is provided and DER evaluation
     if diar_score is not None:
-        metric, mapping_dict, _ = diar_score
-
         # Get session-level diarization error rate and speaker counting error
         der_results = OfflineDiarWithASR.gather_eval_results(
             diar_score=diar_score,

diff --git a/nemo/collections/asr/parts/utils/data_simulation_utils.py b/nemo/collections/asr/parts/utils/data_simulation_utils.py
@@ -25,7 +25,13 @@
 
 from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor
 from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
-from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest, write_text
+from nemo.collections.asr.parts.utils.manifest_utils import (
+    get_ctm_line,
+    read_manifest,
+    write_ctm,
+    write_manifest,
+    write_text,
+)
 from nemo.collections.asr.parts.utils.speaker_utils import labels_to_rttmfile
 from nemo.utils import logging
 
@@ -774,7 +780,16 @@ def create_new_ctm_entry(
                 prev_align = 0 if i == 0 else alignments[i - 1]
                 align1 = round(float(prev_align + start), self._params.data_simulator.outputs.output_precision)
                 align2 = round(float(alignments[i] - prev_align), self._params.data_simulator.outputs.output_precision)
-                text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n"
+                text = get_ctm_line(
+                    source=session_name,
+                    channel=1,
+                    start_time=align1,
+                    duration=align2,
+                    token=word,
+                    conf=None,
+                    type_of_token='lex',
+                    speaker=speaker_id,
+                )
                 arr.append((align1, text))
         return arr
 

diff --git a/nemo/collections/asr/parts/utils/manifest_utils.py b/nemo/collections/asr/parts/utils/manifest_utils.py
@@ -33,6 +33,103 @@
 from nemo.utils.data_utils import DataStoreObject
 
 
+def get_rounded_str_float(num: float, output_precision: int, min_precision=1, max_precision=3) -> str:
+    """
+    Format a float as a fixed-point string, clamping the number of decimals to an allowed range.
+
+    Args:
+        num (float): Number to format.
+        output_precision (int): Requested number of digits after the decimal point.
+        min_precision (int, optional): Lower bound applied to `output_precision`. Defaults to 1.
+        max_precision (int, optional): Upper bound applied to `output_precision`. Defaults to 3.
+
+    Returns:
+        (str): `num` rendered with the clamped number of decimal places.
+    """
+    num_decimals = min(max_precision, max(min_precision, output_precision))
+    return format(num, f".{num_decimals}f")
+
+
+def _ctm_field_to_float(value, name: str) -> float:
+    """Convert a numeric CTM field, given as a real number or a plain decimal string such as "12.5", to float."""
+    if isinstance(value, str) and value.replace('.', '', 1).isdigit():
+        return float(value)
+    # Integers are accepted too (bool excluded), so inputs such as `conf=0` or `start_time=3` are valid.
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        return float(value)
+    raise ValueError(f"`{name}` must be a float or str containing float, but got {type(value)}")
+
+
+def get_ctm_line(
+    source: str,
+    channel: int,
+    start_time: float,
+    duration: float,
+    token: str,
+    conf: float,
+    type_of_token: str,
+    speaker: str,
+    NA_token: str = 'NA',
+    UNK: str = 'unknown',
+    default_channel: str = '1',
+    output_precision: int = 2,
+) -> str:
+    """
+    Get a line in Conversation Time Mark (CTM) format, as described in the `Rich Transcription Meeting Eval Plan: RT09` document.
+
+    CTM Format:
+        <SOURCE><SP><CHANNEL><SP><BEG-TIME><SP><DURATION><SP><TOKEN><SP><CONF><SP><TYPE><SP><SPEAKER><NEWLINE>
+
+    Reference:
+        https://web.archive.org/web/20170119114252/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
+
+    Args:
+        source (str): <SOURCE> is name of the source file, session name or utterance ID
+        channel (int): <CHANNEL> is the channel number. When it is None, `default_channel` is written instead.
+        start_time (float): <BEG_TIME> is the begin time of the word (`start_time` in NeMo); a float, an int or a plain decimal string.
+        duration (float): <DURATION> is duration of the word; a float, an int or a plain decimal string.
+        token (str): <TOKEN> Token or word for the current entry
+        conf (float): <CONF> is a number between 0 (no confidence) and 1 (certainty); a float, an int or a plain decimal string.
+                      When it is None, `NA_token` is written, the value CTM uses when no confidence is computed and in the reference data.
+        type_of_token (str): <TYPE> is the token type. The legal values of <TYPE> are “lex”, “frag”, “fp”, “un-lex”, “for-lex”, “non-lex”, “misc”, or “noscore”.
+                             When it is None, `UNK` is written instead.
+        speaker (str): <SPEAKER> is a string identifier for the speaker who uttered the token. This should be “null” for non-speech tokens and “unknown” when
+                       the speaker has not been determined. When it is None, `NA_token` is written instead.
+        NA_token (str, optional): Placeholder written for a missing `conf` or `speaker`. Defaults to 'NA'.
+        UNK (str, optional): Placeholder written for a missing `type_of_token`. Defaults to 'unknown'.
+        default_channel (str, optional): Channel written when `channel` is None. Defaults to '1'.
+        output_precision (int, optional): Number of decimals for times and confidence, clamped by `get_rounded_str_float`. Defaults to 2.
+
+    Returns:
+        str: Return a line in CTM format filled with the given information.
+
+    Raises:
+        ValueError: If a numeric field is not a number, `conf` is outside [0, 1], or `type_of_token`/`speaker` is invalid.
+    """
+    VALID_TOKEN_TYPES = ["lex", "frag", "fp", "un-lex", "for-lex", "non-lex", "misc", "noscore"]
+
+    start_time = _ctm_field_to_float(start_time, "start_time")
+    duration = _ctm_field_to_float(duration, "duration")
+    if conf is not None:
+        conf = _ctm_field_to_float(conf, "conf")
+        if not (0 <= conf <= 1):
+            raise ValueError(f"`conf` must be between 0 and 1, but got {conf}")
+    if type_of_token is not None and not isinstance(type_of_token, str):
+        raise ValueError(f"`type` must be a string, but got {type(type_of_token)} type {type_of_token}")
+    if type_of_token is not None and type_of_token not in VALID_TOKEN_TYPES:
+        raise ValueError(f"`type` must be one of {VALID_TOKEN_TYPES}, but got {type_of_token}")
+    if speaker is not None and not isinstance(speaker, str):
+        raise ValueError(f"`speaker` must be a string, but got {type(speaker)}")
+
+    channel = default_channel if channel is None else str(channel)
+    conf = NA_token if conf is None else get_rounded_str_float(conf, output_precision)
+    speaker = NA_token if speaker is None else speaker
+    type_of_token = UNK if type_of_token is None else type_of_token
+    start_time = get_rounded_str_float(start_time, output_precision)
+    duration = get_rounded_str_float(duration, output_precision)
+    return f"{source} {channel} {start_time} {duration} {token} {conf} {type_of_token} {speaker}\n"
+
+
 def rreplace(s: str, old: str, new: str) -> str:
     """
     Replace end of string.

diff --git a/scripts/speaker_tasks/create_alignment_manifest.py b/scripts/speaker_tasks/create_alignment_manifest.py
@@ -16,12 +16,41 @@
 import os
 import shutil
 from pathlib import Path
+from typing import List
 
-from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest
+from nemo.collections.asr.parts.utils.manifest_utils import get_ctm_line, read_manifest, write_ctm, write_manifest
 from nemo.utils import logging
 
 
-def get_unaligned_files(unaligned_path):
+def get_seg_info_from_ctm_line(
+    ctm_list: List[str],
+    output_precision: int,
+    speaker_index: int = 7,
+    start_time_index: int = 2,
+    duration_index: int = 3,
+) -> tuple:
+    """
+    Extract the start time, end time and speaker label from one split CTM line.
+    Field positions default to the CTM layout of the `Rich Transcription Meeting Eval Plan: RT09` document.
+
+    Args:
+        ctm_list (list): List containing CTM items. e.g.: ['sw02001-A', '1', '0.000', '0.200', 'hello', '0.98', 'lex', 'speaker3']
+        output_precision (int): Number of decimal places the returned times are rounded to.
+        speaker_index (int, optional): Position of the speaker field in `ctm_list`. Defaults to 7.
+        start_time_index (int, optional): Position of the start-time field in `ctm_list`. Defaults to 2.
+        duration_index (int, optional): Position of the duration field in `ctm_list`. Defaults to 3.
+
+    Returns:
+        tuple: `(start, end, speaker_id)`, in this order; `end` is start time plus duration, `speaker_id` is whitespace-stripped.
+    """
+    # The speaker is the last CTM field, so a line split on ' ' leaves the trailing newline attached to it.
+    speaker_id = ctm_list[speaker_index].strip()
+    start = float(ctm_list[start_time_index])
+    end = round(start + float(ctm_list[duration_index]), output_precision)
+    return round(start, output_precision), end, speaker_id
+
+
+def get_unaligned_files(unaligned_path: str) -> List[str]:
     """
     Get files without alignments in order to filter them out (as they cannot be used for data simulation).
     In the unaligned file, each line contains the file name and the reason for the unalignment, if necessary to specify.
@@ -71,7 +100,17 @@ def create_new_ctm_entry(session_name, speaker_id, wordlist, alignments, output_
             # note that using the current alignments the first word is always empty, so there is no error from indexing the array with i-1
             align1 = float(round(alignments[i - 1], output_precision))
             align2 = float(round(alignments[i] - alignments[i - 1], output_precision,))
-            text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n"
+            text = get_ctm_line(
+                source=session_name,
+                channel=speaker_id,
+                start_time=align1,
+                duration=align2,
+                token=word,
+                conf=0,
+                type_of_token='lex',
+                speaker=speaker_id,
+                output_precision=output_precision,
+            )
             arr.append((align1, text))
     return arr
 
@@ -206,11 +245,7 @@ def create_manifest_with_alignments(
         prev_end = 0
         for i in range(len(lines)):
             ctm = lines[i].split(' ')
-            speaker_id = ctm[1]
-            start = float(ctm[2])
-            end = float(ctm[2]) + float(ctm[3])
-            start = round(start, output_precision)
-            end = round(end, output_precision)
+            speaker_id, start, end = get_seg_info_from_ctm_line(ctm_list=ctm, output_precision=output_precision)
             interval = start - prev_end
 
             if (i == 0 and interval > 0) or (i > 0 and interval > silence_dur_threshold):
@@ -231,13 +266,16 @@ def create_manifest_with_alignments(
             end_times.append(f['duration'])
 
         # build target manifest entry
-        target_manifest.append({})
-        target_manifest[tgt_i]['audio_filepath'] = f['audio_filepath']
-        target_manifest[tgt_i]['duration'] = f['duration']
-        target_manifest[tgt_i]['text'] = f['text']
-        target_manifest[tgt_i]['words'] = words
-        target_manifest[tgt_i]['alignments'] = end_times
-        target_manifest[tgt_i]['speaker_id'] = speaker_id
+        target_manifest.append(
+            {
+                'audio_filepath': f['audio_filepath'],
+                'duration': f['duration'],
+                'text': f['text'],
+                'words': words,
+                'alignments': end_times,
+                'speaker_id': speaker_id,
+            }
+        )
 
         src_i += 1
         tgt_i += 1