From 9f3bb385a97a934b3422fa8e7c9cc7a4e3dc6fde Mon Sep 17 00:00:00 2001
From: ALEXuH
Date: Wed, 8 Nov 2023 08:55:07 +0800
Subject: [PATCH] adding long-form audio speaker diarization

---
 nemo_msdd_configs/diar_infer_general.yaml    | 3 +++
 nemo_msdd_configs/diar_infer_meeting.yaml    | 2 ++
 nemo_msdd_configs/diar_infer_telephonic.yaml | 4 +++-
 requirements.txt                             | 2 +-
 4 files changed, 9 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_general.yaml
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_meeting.yaml
 mode change 100644 => 100755 nemo_msdd_configs/diar_infer_telephonic.yaml

diff --git a/nemo_msdd_configs/diar_infer_general.yaml b/nemo_msdd_configs/diar_infer_general.yaml
old mode 100644
new mode 100755
index 09e4bb3..860c77f
--- a/nemo_msdd_configs/diar_infer_general.yaml
+++ b/nemo_msdd_configs/diar_infer_general.yaml
@@ -52,6 +52,9 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 10 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
+
 
   msdd_model:
     model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)

diff --git a/nemo_msdd_configs/diar_infer_meeting.yaml b/nemo_msdd_configs/diar_infer_meeting.yaml
old mode 100644
new mode 100755
index 2d53d09..0011a85
--- a/nemo_msdd_configs/diar_infer_meeting.yaml
+++ b/nemo_msdd_configs/diar_infer_meeting.yaml
@@ -52,6 +52,8 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
 
   msdd_model:
     model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)

diff --git a/nemo_msdd_configs/diar_infer_telephonic.yaml b/nemo_msdd_configs/diar_infer_telephonic.yaml
old mode 100644
new mode 100755
index c9d7cdf..9062101
--- a/nemo_msdd_configs/diar_infer_telephonic.yaml
+++ b/nemo_msdd_configs/diar_infer_telephonic.yaml
@@ -52,12 +52,14 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
       maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio)
 
   msdd_model:
     model_path: diar_msdd_telephonic # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
     parameters:
       use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used.
-      infer_batch_size: 25 # Batch size for MSDD inference. 
+      infer_batch_size: 25 # Batch size for MSDD inference.
       sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps.
       seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False.
       split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference.

diff --git a/requirements.txt b/requirements.txt
index a0f0b89..2748b84 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 wget
-nemo_toolkit[asr]==1.20.0
+git+https://github.com/NVIDIA/NeMo.git@df9f0d1e522aec2bafec5902fe87c6ee74eaef96#egg=nemo_toolkit[asr]
 transformers>=4.26.1
 faster-whisper==0.9.0
 git+https://github.com/m-bain/whisperX.git@49e0130e4e0c0d99d60715d76e65a71826a97109
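For context, the sketch below shows one way the patched config could be exercised: load the telephonic YAML with OmegaConf and run NeMo's NeuralDiarizer so the new long-form clustering parameters take effect. It is illustrative only; the audio path, manifest file name, and output directory are assumptions, not part of this patch, and the import paths follow the NeMo commit pinned in requirements.txt above.

# Illustrative sketch (not part of the patch): run NeMo diarization with the patched
# config. The audio path, manifest name, and output directory are hypothetical.
import json
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import NeuralDiarizer

# One-line manifest describing the recording to diarize (standard NeMo manifest fields).
meta = {
    "audio_filepath": "/data/long_meeting_mono.wav",  # hypothetical mono 16 kHz recording
    "offset": 0,
    "duration": None,
    "label": "infer",
    "text": "-",
    "num_speakers": None,
    "rttm_filepath": None,
    "uem_filepath": None,
}
with open("input_manifest.json", "w") as f:
    f.write(json.dumps(meta) + "\n")

cfg = OmegaConf.load("nemo_msdd_configs/diar_infer_telephonic.yaml")
cfg.diarizer.manifest_filepath = "input_manifest.json"
cfg.diarizer.out_dir = "diar_out"  # predicted RTTMs are written under diar_out/pred_rttms/

# The two knobs introduced by this patch; per the config comments, lower
# embeddings_per_chunk if GPU memory is tight (10000 covers roughly 40 minutes of audio).
cfg.diarizer.clustering.parameters.chunk_cluster_count = 50
cfg.diarizer.clustering.parameters.embeddings_per_chunk = 10000

NeuralDiarizer(cfg=cfg).diarize()

Per the config comments, embeddings_per_chunk is the memory/runtime trade-off (fewer embeddings per chunk means smaller GPU allocations), while chunk_cluster_count sets how many clusters each chunk is forced into before the chunk-level results are merged.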