diff --git a/nemo_msdd_configs/diar_infer_general.yaml b/nemo_msdd_configs/diar_infer_general.yaml old mode 100644 new mode 100755 index 09e4bb3..860c77f --- a/nemo_msdd_configs/diar_infer_general.yaml +++ b/nemo_msdd_configs/diar_infer_general.yaml @@ -52,6 +52,9 @@ diarizer: max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. sparse_search_volume: 10 # The higher the number, the more values will be examined with more time. maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. + chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering. + embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) + msdd_model: model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) diff --git a/nemo_msdd_configs/diar_infer_meeting.yaml b/nemo_msdd_configs/diar_infer_meeting.yaml old mode 100644 new mode 100755 index 2d53d09..0011a85 --- a/nemo_msdd_configs/diar_infer_meeting.yaml +++ b/nemo_msdd_configs/diar_infer_meeting.yaml @@ -52,6 +52,8 @@ diarizer: max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. + chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering. + embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) msdd_model: model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) diff --git a/nemo_msdd_configs/diar_infer_telephonic.yaml b/nemo_msdd_configs/diar_infer_telephonic.yaml old mode 100644 new mode 100755 index c9d7cdf..9062101 --- a/nemo_msdd_configs/diar_infer_telephonic.yaml +++ b/nemo_msdd_configs/diar_infer_telephonic.yaml @@ -52,12 +52,14 @@ diarizer: max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. + chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering. + embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) msdd_model: model_path: diar_msdd_telephonic # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) parameters: use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used. - infer_batch_size: 25 # Batch size for MSDD inference. + infer_batch_size: 25 # Batch size for MSDD inference. sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps. seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False. split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference.