Skip to content

Commit

Permalink
Update the _convert_pitch function in block_lpcnet. Update data IO
Browse files Browse the repository at this point in the history
  • Loading branch information
TonyWangX committed Apr 22, 2023
1 parent d676f87 commit 1e68bf5
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 42 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,14 @@ git clone --depth 1 https://github.com/nii-yamagishilab/project-NN-Pytorch-scrip
```

* Latest updates:
1. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
1. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
2. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
> Xin Wang, and Junichi Yamagishi. Spoofed training data for speech spoofing countermeasure can be efficiently created using neural vocoders. Proc. ICASSP 2023, accepted. https://arxiv.org/abs/2210.10570
2. Code for the paper for the paper below were added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
3. Code for the paper below was added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
> Xin Wang, and Junichi Yamagishi. Investigating Active-Learning-Based Training Data Selection for Speech Spoofing Countermeasure. In Proc. SLT, accepted. 2023.
3. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
3. Pointer to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).
4. Pointers to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).

4. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).

5. Move from pytorch-1.6 to pytorch-1.7
5. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).

## Contents

Expand Down
5 changes: 5 additions & 0 deletions core_scripts/config_parse/arg_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ def f_args_parsed(argument_input = None):
mes = "External directory to store cache file dic"
parser.add_argument('--path-cache-file', type=str, default="", help=mes)

mes = "Skip scanning data directories (by default False)"
parser.add_argument('--force-skip-datadir-scanning',
action='store_true', default=False, help=mes)


######
# options to save model / checkpoint
parser.add_argument('--save-model-dir', type=str, \
Expand Down
60 changes: 34 additions & 26 deletions core_scripts/data_io/default_data_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ def _data_len_reader(file_path):
"""
file_name, file_ext = os.path.splitext(file_path)
if file_ext == '.wav':
sr, data = nii_wav_tk.waveReadAsFloat(file_path)
length = data.shape[0]
#sr, data = nii_wav_tk.waveReadAsFloat(file_path)
#length = data.shape[0]
length = nii_wav_tk.readWaveLength(file_path)
elif file_ext == '.flac':
sr, data = nii_wav_tk.flacReadAsFloat(file_path)
length = data.shape[0]
Expand Down Expand Up @@ -206,9 +207,11 @@ def _tmp_f(list2, default_value, length):
if global_arg is not None:
self.m_ignore_length_invalid = global_arg.ignore_length_invalid_data
self.m_ignore_cached_finfo = global_arg.ignore_cached_file_infor
self.m_force_skip_scanning = global_arg.force_skip_datadir_scanning
else:
self.m_ignore_length_invalid = False
self.m_ignore_cached_finfo = False
self.m_force_skip_scanning = False

# check augmentation funcctions
if input_augment_funcs:
Expand Down Expand Up @@ -474,9 +477,9 @@ def __getitem__(self, idx_input):
if in_data.shape[0] != tmp_d[s_idx:e_idx].shape[0]:
mes = 'Expected length is {:d}.\n'.format(e_idx-s_idx)
mes += "Loaded length "+str(tmp_d[s_idx:e_idx].shape[0])
mes += 'This may be due to an incompatible cache *.dic.'
mes += '\nPlease check the length in *.dic\n'
mes += 'Please delete it if the cached length is wrong.'
mes += '\nThis may be due to an incompatible cache *.dic.'
mes += '\nPlease check the length in *.dic'
mes += '\nPlease delete it if the cached length is wrong.'
nii_warn.f_print(mes)
nii_warn.f_die("fail to load {:s}".format(file_name))
else:
Expand Down Expand Up @@ -820,26 +823,31 @@ def f_check_file_list(self, data_len_buf_path):
return

# check the list of files exist in all input/output directories
for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e, flag_recur)
tmp_new_list = nii_list_tools.common_members(tmp_list,
self.m_file_list)
if len(tmp_new_list) < 1:
nii_warn.f_print("Possible error when scanning:", 'error')
nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
nii_warn.f_print('Some file names to be scanned:', 'error')
nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
if self.m_file_list[0].endswith(tmp_e):
nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
if os.path.isfile(self.m_file_list[0]):
mes = "The above name seems not to be the data name. "
mes += "It seems to be a file path. "
mes += "\nPlease check test_list, trn_list, val_list."
nii_warn.f_print(mes, 'error')
self.m_file_list = tmp_new_list
break
else:
self.m_file_list = tmp_new_list
if not self.m_force_skip_scanning:
for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
# read a file list from the input directory
tmp_list = nii_list_tools.listdir_with_ext(
tmp_d, tmp_e, flag_recur)
# get the common set of the existing files and those in list
tmp_new_list = nii_list_tools.common_members(
tmp_list, self.m_file_list)

if len(tmp_new_list) < 1:
nii_warn.f_print("Possible error when scanning:", 'error')
nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
nii_warn.f_print('Some file names to be scanned:', 'error')
nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
if self.m_file_list[0].endswith(tmp_e):
nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
if os.path.isfile(self.m_file_list[0]):
mes = "The above name seems not to be the data name. "
mes += "It seems to be a file path. "
mes += "\nPlease check test_list, trn_list, val_list."
nii_warn.f_print(mes, 'error')
self.m_file_list = tmp_new_list
break
else:
self.m_file_list = tmp_new_list

if len(self.m_file_list) < 1:
nii_warn.f_print("\nNo input features found after scanning",'error')
Expand All @@ -853,7 +861,7 @@ def f_check_file_list(self, data_len_buf_path):
nii_warn.f_die("Failed to read input features")

# check output files if necessary
if self.m_output_dirs:
if self.m_output_dirs and not self.m_force_skip_scanning:
for tmp_d, tmp_e in zip(self.m_output_dirs, \
self.m_output_exts):
tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e,
Expand Down
69 changes: 63 additions & 6 deletions core_scripts/data_io/wav_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import os
import sys
import numpy as np
import wave
import scipy.io.wavfile
try:
import soundfile
Expand Down Expand Up @@ -206,6 +207,20 @@ def flacReadAsFloat(wavFileIn):
return sr, x


def readWaveLength(wavFileIn):
    """ length = readWaveLength(wavFileIn)
    Read the length of a waveform file.

    The frame count is taken from the file header through the standard
    wave module, so the samples are not decoded. The wave module only
    parses PCM data; for any file it rejects (e.g., IEEE float waveforms),
    the data is loaded through scipy.io.wavfile and the length is measured
    on the loaded array.

    Input:
         wavFileIn, str, path to the input waveform
    Return:
         length, int, number of frames (sampling points per channel)
    Raise:
         wave.Error, if neither wave nor scipy.io.wavfile can parse the file
    """
    try:
        # fast path: only the header is parsed
        with wave.open(wavFileIn, 'rb') as file_ptr:
            return file_ptr.getnframes()
    except wave.Error as err:
        # fall back to scipy for encodings not supported by wave
        try:
            _, data = scipy.io.wavfile.read(wavFileIn)
        except ValueError:
            # not a readable wav file for scipy either: report the
            # original error so that callers see the same exception type
            raise err
        return data.shape[0]


def buffering(x, n, p=0, opt=None):
"""buffering(x, n, p=0, opt=None)
input
Expand Down Expand Up @@ -277,14 +292,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
shortest_len_in_ms=50,
flag_output=0,
flag_norm_amp=True,
flag_only_startend_sil=False):
flag_only_startend_sil = False,
opt_silence_handler = -1):
"""silence_handler(wav, sr, fl=320, fs=80,
max_thres_below=30,
min_thres=-55,
shortest_len_in_ms=50,
flag_output=0,
flag_norm_amp=True,
flag_only_startend_sil=False)
flag_only_startend_sil = False,
opt_silence_handler = 1)
Based on the Speech activity detector mentioned in Sec5.1 of
Tomi Kinnunen, and Haizhou Li.
Expand All @@ -311,10 +328,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
segment less than this length is treated as speech
flag_norm_amp: bool, whether normalize the waveform amplitude
based on window function (default True)
flag_only_startend_sil: bool, whether only consider silence in
flag_only_startend_sil (obsolete): bool, whether only consider silence in
the begining and end. If False, silence within the utterance
will be marked / removed (default False)
opt_silence_handler: int, option to silence trim handler
0: equivalent to flag_only_startend_sil = False
1: equivalent to flag_only_startend_sil = True
2: remove only silence between words
-1: not use this option, but follow flag_only_startend_sil
output
------
wav_no_sil: np.array, (length_1, ), waveform after removing silence
Expand Down Expand Up @@ -373,9 +396,26 @@ def ignore_short_seg(frame_tag, seg_len_thres):
# remove short nonsil segments
frame_process_all = ignore_short_seg(frame_process_sil, seg_len_thres)
frame_tag = frame_process_all


# if only consder silence in the front and end
if flag_only_startend_sil:
if opt_silence_handler < 0:
# if only consder silence in the front and end
if flag_only_startend_sil:
tmp_nonzero = np.flatnonzero(frame_tag)

# start of the first nonsil segment
#start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
if np.any(tmp_nonzero):
start_nonsil = np.flatnonzero(frame_tag)[0]
# end of the last nonsil segment
end_nonsil = np.flatnonzero(frame_tag)[-1]
# all segments between are switched to nonsil
frame_tag[start_nonsil:end_nonsil] = 1
else:
# no non-silence data, just let it pass
pass
elif opt_silence_handler == 1:
# if only consder silence in the front and end
tmp_nonzero = np.flatnonzero(frame_tag)

# start of the first nonsil segment
Expand All @@ -389,7 +429,24 @@ def ignore_short_seg(frame_tag, seg_len_thres):
else:
# no non-silence data, just let it pass
pass

elif opt_silence_handler == 2:
# if only consder silence in the front and end
tmp_nonzero = np.flatnonzero(frame_tag)

# start of the first nonsil segment
#start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
if np.any(tmp_nonzero):
start_nonsil = np.flatnonzero(frame_tag)[0]
# end of the last nonsil segment
end_nonsil = np.flatnonzero(frame_tag)[-1]
# all segments between are switched to nonsil
frame_tag[:start_nonsil] = 1
frame_tag[end_nonsil:] = 1
else:
# no non-silence data, just let it pass
pass
else:
pass


# separate non-speech and speech segments
Expand Down
2 changes: 2 additions & 0 deletions project/05-nn-vocoders/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ This project is Pytorch re-implementation of a few neural waveform models.

* Note that the tutorial **chapter_a3_pretrained_vocoders.ipynb** includes pre-trained HiFiGAN and WaveGlow on VoxCeleb2 dev and other speech datasets [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing).

* The code to extract the input Mel-spectrogram and F0 is included in the above tutorial and notebooks as well. This folder assumes that the input Mel-spectrogram and F0 have been prepared in advance.

**It is better to check the tutorials before diving into this project**.

## Quick start
Expand Down
3 changes: 2 additions & 1 deletion project/05-nn-vocoders/ilpcnet/block_lpcnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,8 @@ def _convert_pitch(self, pitch_value):
------
output: tensor in int64, quantized pitch
"""
return torch.clamp((pitch_value - 33) // 2, 0, 256).to(torch.int64)
return torch.clamp((pitch_value - 33) // 2, 0,
self.m_pitch_cat-1).to(torch.int64)


def forward(self, cond_feat, cond_feat_normed,
Expand Down
8 changes: 7 additions & 1 deletion sandbox/block_rawnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,12 +380,18 @@ def _compute_score(self, emb, inference=True):
Score here refers to
"""
# we should not use logsoftmax if we will use CrossEntropyLoss
flag_logsoftmax = False

if inference:
# no softmax
return self.m_output(emb)
else:
elif flag_logsoftmax:
# Logsoftmax for training loss
# this is used when the training criterion is NLLoss
return self.logsoftmax(self.m_output(emb))
else:
return self.m_output(emb)

def forward(self, x):
"""
Expand Down
78 changes: 78 additions & 0 deletions sandbox/eval_asvspoof.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,84 @@ def tDCF_wrapper(bonafide_cm_scores, spoof_cm_scores,
return min_tDCF, eer_cm, eer_threshold


def tDCF_wrapper2(bonafide_score_cm, spoof_score_cm, C0, C1, C2):
    """ mintDCF, eer = tDCF_wrapper2(bonafide_score_cm,
                                     spoof_score_cm, C0, C1, C2)

    compute_tDCF can be factorized into two parts:
    C012 computation and min t-DCF computation.

    This function covers the second part: given the values of C0, C1,
    and C2, it computes the min t-DCF (and the EER) from the CM scores.

    input
    -----
      bonafide_score_cm  np.array, score of bonafide data
      spoof_score_cm     np.array, score of spoofed data
      C0                 scalar, coefficient for min tDCF computation
      C1                 scalar, coefficient for min tDCF computation
      C2                 scalar, coefficient for min tDCF computation

    output
    ------
      mintDCF            scalar, value of min tDCF
                         (np.nan if any of C0, C1, C2 is nan)
      eer                scalar, value of EER

    The process exits through sys.exit if the scores contain nan / inf,
    or if there are fewer than three unique score values.

    For C0, C1, C2, see Appendix Eqs.(1-2) in evaluation plan [1],
    or Eqs.(10-11) in [2]

    References:
      [1] T. Kinnunen, H. Delgado, N. Evans,K.-A. Lee, V. Vestman,
          A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi,
          and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures
          and Automatic Speaker Verification: Fundamentals," IEEE/ACM
          Transaction on Audio, Speech and Language Processing (TASLP).
      [2] ASVspoof 2019 challenge evaluation plan
          https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
    """
    all_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))

    # sanity check: every score must be a finite number
    has_nan = np.isnan(all_scores).any()
    if has_nan or np.isinf(all_scores).any():
        sys.exit('ERROR: Your scores contain nan or inf.')

    # sanity check: soft scores are expected, not hard decisions
    if np.unique(all_scores).size < 3:
        sys.exit('ERROR: You should provide soft CM scores - not binary decisions')

    # miss and false alarm rates of the CM (thresholds are not needed here)
    Pmiss_cm, Pfa_cm, _ = compute_det_curve(bonafide_score_cm, spoof_score_cm)

    # -----
    # min t-DCF
    # -----
    if np.isnan(C0) or np.isnan(C1) or np.isnan(C2):
        # coefficients are not available, min t-DCF is left undefined
        mintDCF = np.nan
    else:
        # t-DCF curve, normalized by the default t-DCF
        default_cost = C0 + np.minimum(C1, C2)
        normed_cost = (C0 + C1 * Pmiss_cm + C2 * Pfa_cm) / default_cost
        mintDCF = normed_cost[np.argmin(normed_cost)]

    # -----
    # EER: operating point where miss and false alarm rates are closest
    # -----
    eer_idx = np.argmin(np.abs(Pmiss_cm - Pfa_cm))
    eer = np.mean((Pmiss_cm[eer_idx], Pfa_cm[eer_idx]))

    return mintDCF, eer



def ASVspoof2019_evaluate(bonafide_cm_scores, bonafide_cm_file_names,
spoof_cm_scores, spoof_cm_file_names, verbose=False,
protocol_alternative=None):
Expand Down
2 changes: 1 addition & 1 deletion sandbox/util_loss_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ def rank_consistency_v3(x, metric = None):
>> rank_consistency_v3(x, metric)
tensor(.0)
"""

# batch size
bs = x.shape[0]

# loss to be accumulated
Expand Down

0 comments on commit 1e68bf5

Please sign in to comment.