From 9f3bb385a97a934b3422fa8e7c9cc7a4e3dc6fde Mon Sep 17 00:00:00 2001
From: ALEXuH
Date: Wed, 8 Nov 2023 08:55:07 +0800
Subject: [PATCH] adding long-form audio speaker diarization

---
 nemo_msdd_configs/diar_infer_general.yaml    | 3 +++
 nemo_msdd_configs/diar_infer_meeting.yaml    | 2 ++
 nemo_msdd_configs/diar_infer_telephonic.yaml | 4 +++-
 requirements.txt                             | 2 +-
 4 files changed, 9 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_general.yaml
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_meeting.yaml
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_telephonic.yaml

diff --git a/nemo_msdd_configs/diar_infer_general.yaml b/nemo_msdd_configs/diar_infer_general.yaml
old mode 100644
new mode 100755
index 09e4bb3..860c77f
--- a/nemo_msdd_configs/diar_infer_general.yaml
+++ b/nemo_msdd_configs/diar_infer_general.yaml
@@ -52,6 +52,9 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 10 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
+
 
   msdd_model:
     model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)

diff --git a/nemo_msdd_configs/diar_infer_meeting.yaml b/nemo_msdd_configs/diar_infer_meeting.yaml
old mode 100644
new mode 100755
index 2d53d09..0011a85
--- a/nemo_msdd_configs/diar_infer_meeting.yaml
+++ b/nemo_msdd_configs/diar_infer_meeting.yaml
@@ -52,6 +52,8 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
 
   msdd_model:
     model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)

diff --git a/nemo_msdd_configs/diar_infer_telephonic.yaml b/nemo_msdd_configs/diar_infer_telephonic.yaml
old mode 100644
new mode 100755
index c9d7cdf..9062101
--- a/nemo_msdd_configs/diar_infer_telephonic.yaml
+++ b/nemo_msdd_configs/diar_infer_telephonic.yaml
@@ -52,12 +52,14 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
 
   msdd_model:
     model_path: diar_msdd_telephonic # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
     parameters:
       use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used.
-      infer_batch_size: 25 # Batch size for MSDD inference. 
+      infer_batch_size: 25 # Batch size for MSDD inference.
       sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps.
       seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False.
       split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference.

diff --git a/requirements.txt b/requirements.txt
index a0f0b89..2748b84 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 wget
-nemo_toolkit[asr]==1.20.0
+git+https://github.com/NVIDIA/NeMo.git@df9f0d1e522aec2bafec5902fe87c6ee74eaef96#egg=nemo_toolkit[asr]
 transformers>=4.26.1
 faster-whisper==0.9.0
 git+https://github.com/m-bain/whisperX.git@49e0130e4e0c0d99d60715d76e65a71826a97109
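For context, the sketch below shows one way the patched config could be exercised: load the telephonic YAML with OmegaConf and run NeMo's NeuralDiarizer so the new long-form clustering parameters take effect. It is illustrative only; the audio path, manifest file name, and output directory are assumptions, not part of this patch, and the import paths follow the NeMo commit pinned in requirements.txt above.

# Illustrative sketch (not part of the patch): run NeMo diarization with the patched
# config. The audio path, manifest name, and output directory are hypothetical.
import json
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import NeuralDiarizer

# One-line manifest describing the recording to diarize (standard NeMo manifest fields).
meta = {
    "audio_filepath": "/data/long_meeting_mono.wav",  # hypothetical mono 16 kHz recording
    "offset": 0,
    "duration": None,
    "label": "infer",
    "text": "-",
    "num_speakers": None,
    "rttm_filepath": None,
    "uem_filepath": None,
}
with open("input_manifest.json", "w") as f:
    f.write(json.dumps(meta) + "\n")

cfg = OmegaConf.load("nemo_msdd_configs/diar_infer_telephonic.yaml")
cfg.diarizer.manifest_filepath = "input_manifest.json"
cfg.diarizer.out_dir = "diar_out"  # predicted RTTMs are written under diar_out/pred_rttms/

# The two knobs introduced by this patch; per the config comments, lower
# embeddings_per_chunk if GPU memory is tight (10000 covers roughly 40 minutes of audio).
cfg.diarizer.clustering.parameters.chunk_cluster_count = 50
cfg.diarizer.clustering.parameters.embeddings_per_chunk = 10000

NeuralDiarizer(cfg=cfg).diarize()

Per the config comments, embeddings_per_chunk is the memory/runtime trade-off (fewer embeddings per chunk means smaller GPU allocations), while chunk_cluster_count sets how many clusters each chunk is forced into before the chunk-level results are merged.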