diff --git a/.gitignore b/.gitignore index 5764bfe22c6..9f219d458a4 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,8 @@ GSYMS /egs/*/*/plp /egs/*/*/exp /egs/*/*/data +/egs/*/*/wav +/egs/*/*/enhan # /tools/ /tools/pocolm/ diff --git a/egs/chime5/s5b/local/nnet3/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh old mode 100755 new mode 100644 index 095e85cc338..fa627acd27b --- a/egs/chime5/s5b/local/nnet3/compare_wer.sh +++ b/egs/chime5/s5b/local/nnet3/compare_wer.sh @@ -130,3 +130,4 @@ done echo echo + diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh index 7af09f36a13..8fa54e0d4a6 100755 --- a/egs/chime5/s5b/local/nnet3/decode.sh +++ b/egs/chime5/s5b/local/nnet3/decode.sh @@ -35,6 +35,8 @@ post_decode_acwt=1.0 # important to change this when using chain models extra_left_context_initial=0 extra_right_context_final=0 +graph_affix= + score_opts="--min-lmwt 6 --max-lmwt 13" . ./cmd.sh @@ -94,7 +96,7 @@ if [ $stage -le 2 ]; then fi fi -decode_dir=$dir/decode_${data_set}${affix} +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} # generate the lattices if [ $stage -le 3 ]; then echo "Generating lattices, stage 1" diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh index 5c74c9ff242..989a5f95d01 100755 --- a/egs/chime5/s5b/local/run_recog.sh +++ b/egs/chime5/s5b/local/run_recog.sh @@ -28,8 +28,8 @@ json_dir=${chime5_corpus}/transcriptions audio_dir=${chime5_corpus}/audio # training and test data -train_set=train_worn_u100k -test_sets="eval_${enhancement}_ref" +train_set=train_worn_simu_u400k +test_sets="eval_${enhancement}_dereverb_ref" # This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1 @@ -38,18 +38,27 @@ if [ $stage -le 4 ]; then # Beamforming using reference arrays # enhanced WAV directory enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ for dset in eval; do for mictype in u01 u02 u03 u04 u05 u06; do - local/run_beamformit.sh --cmd 
"$train_cmd" \ + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ ${enhandir}/${dset}_${enhancement}_${mictype} \ ${mictype} done done - + for dset in eval; do local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ - ${json_dir}/${dset} data/${dset}_${enhancement}_ref + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref done fi @@ -92,28 +101,13 @@ if [ $stage -le 7 ]; then done fi -if [ $stage -le 17 ]; then - nnet3_affix=_${train_set}_cleaned - for datadir in ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - for datadir in ${test_sets}; do - steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires || exit 1; - done - for data in $test_sets; do - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ - data/${data}_hires exp/nnet3${nnet3_affix}/extractor \ - exp/nnet3${nnet3_affix}/ivectors_${data}_hires - done -fi +nnet3_affix=_${train_set}_cleaned_rvb + +lm_suffix= if [ $stage -le 18 ]; then # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). - lm_suffix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. 
@@ -138,16 +132,14 @@ if [ $stage -le 18 ]; then for data in $test_sets; do ( - steps/nnet3/decode.sh \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj 8 --cmd "$decode_cmd" --num-threads 4 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ - $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + --graph-affix ${lm_suffix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn1b_sp ) || touch $dir/.error & done wait @@ -159,6 +151,6 @@ if [ $stage -le 20 ]; then # please specify both dev and eval set directories so that the search parameters # (insertion penalty and language model weight) will be tuned using the dev set local/score_for_submit.sh \ - --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \ - --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_dereverb_ref_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_dereverb_ref_2stage fi diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh index 1c4b1c80291..ed512e69aae 100755 --- a/egs/chime5/s5b/local/run_wpe.sh +++ b/egs/chime5/s5b/local/run_wpe.sh @@ -33,7 +33,8 @@ set -o pipefail miniconda_dir=$HOME/miniconda3/ if [ ! -d $miniconda_dir ]; then - echo "$miniconda_dir does not exist. 
Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 fi # check if WPE is installed diff --git a/egs/chime6/README.txt b/egs/chime6/README.txt new file mode 100644 index 00000000000..9fb48c26822 --- /dev/null +++ b/egs/chime6/README.txt @@ -0,0 +1,6 @@ +This is a kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6). + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. + +s5_track1 : Track 1 of the challenge (oracle segments and speaker label is provided) +s5_track2 : Track 2 of the challenge (only raw audio is provided) diff --git a/egs/chime6/s5_track1/RESULTS b/egs/chime6/s5_track1/RESULTS new file mode 100644 index 00000000000..73b47ddf3cc --- /dev/null +++ b/egs/chime6/s5_track1/RESULTS @@ -0,0 +1,21 @@ + +# tri2 +%WER 88.52 [ 52121 / 58881, 2023 ins, 30285 del, 19813 sub ] exp/tri2/decode_dev_gss/wer_17_0.5 + +# tri3 +%WER 85.72 [ 50471 / 58881, 3079 ins, 23787 del, 23605 sub ] exp/tri3/decode_dev_gss/wer_17_0.5 + +# nnet3 tdnn+chain +%WER 41.21 [ 24267 / 58881, 2428 ins, 7606 del, 14233 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_worn_2stage/wer_11_0.0 +%WER 51.76 [ 30474 / 58881, 2665 ins, 11749 del, 16060 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_gss_multiarray_2stage/wer_10_0.0 + +# result with the challenge submission format (Nov 17, 2019) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 4459, wer 53.80 % +session S02 room KITCHEN: #words 12696, #errors 7170, wer 56.47 % +session S02 room LIVING: #words 15460, #errors 7388, wer 47.78 % +session S09 room DINING: #words 5766, #errors 3100, wer 53.76 % +session S09 room KITCHEN: #words 8911, #errors 4483, wer 50.30 % +session S09 room LIVING: #words 7760, #errors 3874, 
wer 49.92 % +overall: #words 58881, #errors 30474, wer 51.75 % diff --git a/egs/chime6/s5_track1/cmd.sh b/egs/chime6/s5_track1/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime6/s5_track1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime6/s5_track1/conf/beamformit.cfg b/egs/chime6/s5_track1/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime6/s5_track1/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime6/s5_track1/conf/mfcc.conf b/egs/chime6/s5_track1/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git 
a/egs/chime6/s5_track1/conf/mfcc_hires.conf b/egs/chime6/s5_track1/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track1/conf/online_cmvn.conf b/egs/chime6/s5_track1/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track1/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track1/conf/queue.conf b/egs/chime6/s5_track1/conf/queue.conf new file mode 100644 index 00000000000..73103195684 --- /dev/null +++ b/egs/chime6/s5_track1/conf/queue.conf @@ -0,0 +1,10 @@ +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l hostname='!b19*' +option gpu=* -l gpu=$0 -q g.q -l hostname='!b19*' + diff --git a/egs/chime6/s5_track1/local/add_location_to_uttid.sh b/egs/chime6/s5_track1/local/add_location_to_uttid.sh new file mode 100755 index 00000000000..91bd0c0dd37 --- /dev/null +++ b/egs/chime6/s5_track1/local/add_location_to_uttid.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Author: Ashish Arora +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +enhancement=gss +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/add_location_to_uttid.sh [options] " + echo " " + echo "main options (for others, see top of script file)" + echo " --enhancement # enhancement type (gss or beamformit)" + exit 1; +fi + +jdir=$1 +puttdir=$2 +utt_loc_file=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [[ ${enhancement} == *gss* ]]; then + local/get_location.py $jdir > $utt_loc_file + local/replace_uttid.py $utt_loc_file $puttdir/per_utt > $puttdir/per_utt_loc +fi + +if [[ ${enhancement} == *beamformit* ]]; then + cat $puttdir/per_utt > $puttdir/per_utt_loc +fi diff --git a/egs/chime6/s5_track1/local/chain/compare_wer.sh b/egs/chime6/s5_track1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..cd6be14ed88 --- /dev/null +++ b/egs/chime6/s5_track1/local/chain/compare_wer.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/chime6/s5_track1/local/chain/run_tdnn.sh b/egs/chime6/s5_track1/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/chime6/s5_track1/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..daad37e2cd7 --- /dev/null +++ b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + 
--egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2776 combine=-0.134->-0.133 (over 3) xent:train/valid[285,428,final]=(-2.37,-1.95,-1.95/-2.19,-1.90,-1.91) logprob:train/valid[285,428,final]=(-0.201,-0.125,-0.124/-0.198,-0.147,-0.148) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. +skip_decoding=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 
+ tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ] && [[ $skip_decoding == "false" ]]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime6/s5_track1/local/check_tools.sh b/egs/chime6/s5_track1/local/check_tools.sh new file mode 100755 index 00000000000..8e80e25ca33 --- /dev/null +++ b/egs/chime6/s5_track1/local/check_tools.sh @@ -0,0 +1,76 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" != "1" ]; then + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +# this is used for the audio synchronization +sox_conda=`command -v ${miniconda_dir}/bin/sox 2>/dev/null` +if [ -z "${sox_conda}" ]; then + echo "install conda sox (v14.4.2)" + ${miniconda_dir}/bin/conda install -c conda-forge sox +fi + +exit 0 diff --git a/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" 
+ exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" + exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime6/s5_track1/local/decode.sh b/egs/chime6/s5_track1/local/decode.sh new file mode 100755 index 00000000000..b44716ba4ac --- /dev/null +++ b/egs/chime6/s5_track1/local/decode.sh @@ -0,0 +1,253 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This is a subset of run.sh to only perform recognition experiments with evaluation data +# This script can be run from run.sh or standalone.  
+# To run it standalone, you can download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the {data/ exp/} directory to your {data/ exp/} + +# Begin configuration section. +decode_nj=20 +gss_nj=50 +stage=0 +enhancement=gss # for a new enhancement method, + # change this variable and stage 4 + +# training data +train_set=train_worn_simu_u400k +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 +test_sets="dev_${enhancement} eval_${enhancement}" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. 
Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +######################################################################################### +# In stage 1, we perform GSS based enhancement or beamformit for the test sets. multiarray = true +#can take around 10hrs for dev and eval set. +######################################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *gss* ]]; then + echo "$0: enhance data..." + # Guided Source Separation (GSS) from Paderborn University + # http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_boeddecker.pdf + # @Article{PB2018CHiME5, + # author = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, + # title = {{Front-End Processing for the CHiME-5 Dinner Party Scenario}}, + # year = {2018}, + # booktitle = {CHiME5 Workshop}, + # } + + if [ ! -d pb_chime5/ ]; then + local/install_pb_chime5.sh + fi + + if [ ! -f pb_chime5/cache/chime6.json ]; then + ( + cd pb_chime5 + miniconda_dir=$HOME/miniconda3/ + export PATH=$miniconda_dir/bin:$PATH + export CHIME6_DIR=$chime6_corpus + make cache/chime6.json + ) + fi + + for dset in dev eval; do + local/run_gss.sh \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 160 \ + ${dset} \ + ${enhanced_dir} \ + ${enhanced_dir} || exit 1 + done + + for dset in dev eval; do + local/prepare_data.sh --mictype gss ${enhanced_dir}/audio/${dset} \ + ${json_dir}/${dset} data/${dset}_${enhancement} || exit 1 + done +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. 
+####################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *beamformit* ]]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhanced_dir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhanced_dir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhanced_dir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement} + done +fi + +# In GSS enhancement, we do not have array information in utterance ID +if [ $stage -le 2 ] && [[ ${enhancement} == *gss* ]]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_orig + done + + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_orig data/${dset} + done +fi + +if [ $stage -le 2 ] && [[ ${enhancement} == *beamformit* ]]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + echo "$0: fix data..." 
+ for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + for f in segments text wav.scp; do + if [ -f data/${dset}_nosplit/$f ]; then + cp data/${dset}_nosplit/$f data/${dset}_nosplit_fix + fi + done + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +########################################################################## +# DECODING: we perform 2 stage decoding. +########################################################################## + +nnet3_affix=_${train_set}_cleaned_rvb +lm_suffix= + +if [ $stage -le 3 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + echo "$0: decode data..." + affix=1b # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. 
+ chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn${affix}_sp + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +########################################################################## +# Scoring: here we obtain wer per session per location and overall WER +########################################################################## + +if [ $stage -le 4 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh --enhancement $enhancement --json $json_dir \ + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_2stage +fi diff --git a/egs/chime6/s5_track1/local/distant_audio_list b/egs/chime6/s5_track1/local/distant_audio_list new file mode 100644 index 00000000000..710945b014b --- /dev/null +++ b/egs/chime6/s5_track1/local/distant_audio_list @@ -0,0 +1,372 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 
+S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 +S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 
+S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 +S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 
+S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime6/s5_track1/local/extract_noises.py b/egs/chime6/s5_track1/local/extract_noises.py new file mode 100755 index 00000000000..8f617752f2d --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_noises.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. {} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. 
/export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +# remove mic dependency for CHiME-6 +def Get_time(conf, tag, fs): + for i in conf: + st = Trans_time(i['start_time'], fs) + ed = Trans_time(i['end_time'], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." 
+ channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/extract_vad_weights.sh b/egs/chime6/s5_track1/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
+decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! -f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime6/s5_track1/local/generate_chime6_data.sh 
b/egs/chime6/s5_track1/local/generate_chime6_data.sh new file mode 100755 index 00000000000..93106cf605a --- /dev/null +++ b/egs/chime6/s5_track1/local/generate_chime6_data.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Copyright 2019, Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script generates synchronized audio data across arrays by considering +# the frame dropping, clock drift etc. done by Prof. Jon Barker at University of +# Sheffield. This script first downloads the synchronization tool and generate +# the synchronized audios and corresponding JSON transcription files +# Note that +# 1) the JSON format is slightly changed from the original CHiME-5 one (simplified +# thanks to the synchronization) +# 2) it requires sox v.14.4.2 and Python 3.6.7 +# Unfortunately, the generated files would be different depending on the sox +# and Python versions and to generate the exactly same audio files, this script uses +# the fixed versions of sox and Python installed in the miniconda instead of system ones + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/generate_chime6_data.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 +expdir=${PWD}/exp/chime6_data + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# get chime6-synchronisation tools +SYNC_PATH=${PWD}/chime6-synchronisation +if [ ! 
-d ${SYNC_PATH} ]; then + git clone https://github.com/chimechallenge/chime6-synchronisation.git +fi + +mkdir -p ${odir} +mkdir -p ${expdir}/log + +# split the session to avoid too much disk access +sessions1="S01 S02 S03 S04 S05 S06 S07" +sessions2="S08 S09 S12 S13 S16 S17 S18" +sessions3="S19 S20 S21 S22 S23 S24" + +CONDA_PATH=${HOME}/miniconda3/bin +IN_PATH=${sdir}/audio +OUT_PATH=${odir}/audio +TMP_PATH=${odir}/audio_tmp + +if [ ! -d "${IN_PATH}" ]; then + echo "please specify the CHiME-5 data path correctly" + exit 1 +fi +mkdir -p $OUT_PATH/train $OUT_PATH/eval $OUT_PATH/dev +mkdir -p $TMP_PATH/train $TMP_PATH/eval $TMP_PATH/dev + +if [ -f ${odir}/audio/dev/S02_P05.wav ]; then + echo "CHiME-6 date already exists" + exit 0 +fi + +pushd ${SYNC_PATH} +echo "Correct for frame dropping" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait + +echo "Sox processing for correcting clock drift" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} 
--sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait + +echo "adjust the JSON files" +mkdir -p ${odir}/transcriptions/eval ${odir}/transcriptions/dev ${odir}/transcriptions/train +${CONDA_PATH}/python correct_transcript_for_clock_drift.py --clock_drift_data chime6_audio_edits.json ${sdir}/transcriptions ${odir}/transcriptions +popd + +# finally check md5sum +pushd ${odir} +echo "check MD5 hash value for generated audios" +md5sum -c ${SYNC_PATH}/audio_md5sums.txt || echo "check https://github.com/chimechallenge/chime6-synchronisation" +popd + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/get_location.py b/egs/chime6/s5_track1/local/get_location.py new file mode 100755 index 00000000000..92351e72e65 --- /dev/null +++ b/egs/chime6/s5_track1/local/get_location.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script create a utterance and location mapping file +# It is used in score_for_submit script to get locationwise WER. 
+# for GSS enhancement + +import json +from datetime import timedelta +from glob import glob +import sys, io +from decimal import Decimal + +SAMPLE_RATE = 16000 + +def to_samples(time: str): + "mapping time in string to int, as mapped in pb_chime5" + "see https://github.com/fgnt/pb_chime5/blob/master/pb_chime5/database/chime5/get_speaker_activity.py" + hours, minutes, seconds = [t for t in time.split(':')] + hours = int(hours) + minutes = int(minutes) + seconds = Decimal(seconds) + + seconds_samples = seconds * SAMPLE_RATE + samples = ( + hours * 3600 * SAMPLE_RATE + + minutes * 60 * SAMPLE_RATE + + seconds_samples + ) + return int(samples) + + +def main(): + output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + for file in json_files: + with open(file, 'r') as f: + session_dict = json.load(f) + + for uttid in session_dict: + try: + ref=uttid['ref'] + speaker_id = uttid['speaker'] + location = uttid['location'] + location=location.upper() + session_id=uttid['session_id'] + words = uttid['words'] + end_sample=to_samples(str(uttid['end_time'])) + start_sample=to_samples(str(uttid['start_time'])) + start_sample_str = str(int(start_sample * 100 / SAMPLE_RATE)).zfill(7) + end_sample_str = str(int(end_sample * 100 / SAMPLE_RATE)).zfill(7) + utt = "{0}_{1}-{2}-{3}".format(speaker_id, session_id, start_sample_str, end_sample_str) + location_dict[utt]=(location) + except: + continue + + for key in sorted(location_dict.keys()): + utt= "{0} {1}".format(key, location_dict[key]) + output.write(utt+ '\n') + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/install_pb_chime5.sh b/egs/chime6/s5_track1/local/install_pb_chime5.sh new file mode 
100755 index 00000000000..a151dc60f12 --- /dev/null +++ b/egs/chime6/s5_track1/local/install_pb_chime5.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Installs pb_chime5 +# miniconda should be installed in $HOME/miniconda3/ + +miniconda_dir=$HOME/miniconda3/ + +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run 'tools/extras/install_miniconda.sh" && exit 1; +fi + +git clone https://github.com/fgnt/pb_chime5.git +cd pb_chime5 +# Download submodule dependencies # https://stackoverflow.com/a/3796947/5766934 +git submodule init +git submodule update + +$miniconda_dir/bin/python -m pip install cython +$miniconda_dir/bin/python -m pip install pymongo +$miniconda_dir/bin/python -m pip install fire +$miniconda_dir/bin/python -m pip install -e pb_bss/ +$miniconda_dir/bin/python -m pip install -e . diff --git a/egs/chime6/s5_track1/local/json2text.py b/egs/chime6/s5_track1/local/json2text.py new file mode 100755 index 00000000000..34cf52f086b --- /dev/null +++ b/egs/chime6/s5_track1/local/json2text.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'gss', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" 
+ logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn' or args.mictype == 'gss': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + # remove mic dependency for CHiME-6 + start_time = x['start_time'] + end_time = x['end_time'] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype in ['worn', 'gss']: + uttid += '_' + mictype + + if args.mictype == 'gss': + uttid += '-' + start_time + '-' + end_time + else: + uttid += '_' + location + '-' + start_time + '-' + end_time + + # In several utterances, there are inconsistency in the time stamp + # (the end time is earlier than the start time) + # We just ignored such utterances. 
+ if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime6/s5_track1/local/make_noise_list.py b/egs/chime6/s5_track1/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime6/s5_track1/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime6/s5_track1/local/nnet3/compare_wer.sh b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/chime6/s5_track1/local/nnet3/decode.sh b/egs/chime6/s5_track1/local/nnet3/decode.sh new file mode 100755 index 00000000000..8fa54e0d4a6 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/decode.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! 
-s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if 
[ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! -z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..3910e1812a3 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$(utils/make_absolute.sh $1) +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir + +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +elif [ $mictype == "gss" ]; then + find -L $adir -name "P[0-9]*_S[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + print "$f $path\n"; + }' | sort > $dir/wav.scp + + cat $dir/text.orig | sort > $dir/text +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts 
from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +elif [ $mictype != "gss" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi + +cut -f 1 -d ' ' $dir/text | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime6/s5_track1/local/prepare_dict.sh b/egs/chime6/s5_track1/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime6/s5_track1/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. 
./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. 
+paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print $2}' 
$dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime6/s5_track1/local/replace_uttid.py b/egs/chime6/s5_track1/local/replace_uttid.py new file mode 100755 index 00000000000..96c45b58783 --- /dev/null +++ b/egs/chime6/s5_track1/local/replace_uttid.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script is used in score_for_submit. It adds locationid to the utteranceid, +# using uttid_location file, for locationwise scoring. 
+ +import sys, io +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_uttid_location(f): + locations = {} + for line in f: + parts=line.strip().split(' ') + uttid, loc = parts[0], parts[1] + locations[uttid] = loc + return locations + +locations = load_uttid_location(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + uttid, res = line.split(None, 1) + try: + location = locations[uttid] + location_uttid = location +'_'+ str(uttid) + output.write(location_uttid + ' ' + res) + except KeyError as e: + raise Exception("Could not find utteranceid in " + "uttid_location file" + "({0})\n".format(str(e))) diff --git a/egs/chime6/s5_track1/local/reverberate_lat_dir.sh b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. 
./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n 
in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm $dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime6/s5_track1/local/run_beamformit.sh b/egs/chime6/s5_track1/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/run_gss.sh b/egs/chime6/s5_track1/local/run_gss.sh new file mode 100755 index 00000000000..fbdc4af25d1 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_gss.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +if [ -f ./path.sh ]; then . 
./path.sh; fi + +# Config: +cmd=run.pl +nj=4 +multiarray=outer_array_mics +bss_iterations=5 +context_samples=160000 +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_gss.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bss_iterations 5 # Number of EM iterations" + echo " --context_samples 160000 # Left-right context in number of samples" + echo " --multiarray # Multiarray configuration" + exit 1; +fi + +# setting multiarray as "true" uses all mics, we didn't see any performance +# gain from this we have chosen settings that makes the enhacement finish +# in around 1/3 of a day without significant change in performance. +# our result during the experiments are as follows: + +#MAF: multi array = False +#MAT: multi array = True +#Enhancement Iterations Num Microphones Context Computational time for GSS #cpus dev WER eval WER +#GSS(MAF) 10 24 17 hrs 30 62.3 57.98 +#GSS(MAT) 5 24 10s 26 hrs 50 53.15 53.77 +#GSS(MAT) 5 12 10s 9.5 hrs 50 53.09 53.75 + +session_id=$1 +log_dir=$2 +enhanced_dir=$3 +if [ ! -d pb_chime5/ ]; then + echo "Missing pb_chime5, run 'local/install_pb_chime5'" + exit 1 +fi + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir/ ]; then + echo "$miniconda_dir/ does not exist. 
Please run '../../../tools/extras/install_miniconda.sh'" + exit 1 +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || \ + { echo "Could not make absolute '$enhanced_dir'" && exit 1; } + +$cmd JOB=1:$nj $log_dir/log/enhance_${session_id}.JOB.log \ + cd pb_chime5/ '&&' \ + $miniconda_dir/bin/python -m pb_chime5.scripts.kaldi_run with \ + chime6=True \ + storage_dir=$enhanced_dir \ + session_id=$session_id \ + job_id=JOB number_of_jobs=$nj \ + bss_iterations=$bss_iterations \ + context_samples=$context_samples \ + multiarray=$multiarray || exit 1 diff --git a/egs/chime6/s5_track1/local/run_wpe.py b/egs/chime6/s5_track1/local/run_wpe.py new file mode 100755 index 00000000000..fbb264f2fd2 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +# to avoid huge memory consumption we decided to use `wpe_v8` instead of the original wpe by +# following the advice from Christoph Boeddeker at Paderborn University +# https://github.com/chimechallenge/kaldi_chime6/commit/2ea6ac07ef66ad98602f073b24a233cb7f61605c#r36147334 +from nara_wpe.wpe import wpe_v8 as wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime6/s5_track1/local/run_wpe.sh b/egs/chime6/s5_track1/local/run_wpe.sh new file mode 100755 index 00000000000..ed512e69aae --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/score.sh b/egs/chime6/s5_track1/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track1/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track1/local/score_for_submit.sh b/egs/chime6/s5_track1/local/score_for_submit.sh new file mode 100755 index 00000000000..ba7d6cde574 --- /dev/null +++ b/egs/chime6/s5_track1/local/score_for_submit.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Copyright 2019 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script provides official CHiME-6 challenge track 1 submission scores per room and session. 
+# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=true +enhancement=gss +json= + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-6 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + echo " --enhancement # enhancement type (gss or beamformit)" + echo " --json # directory containing CHiME-6 json files" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get uttid location mapping +local/add_location_to_uttid.sh --enhancement $enhancement $json/dev \ + $dev/scoring_kaldi/wer_details/ $dev/scoring_kaldi/wer_details/uttid_location +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt_loc + +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | 
grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt + +local/add_location_to_uttid.sh --enhancement $enhancement $json/eval \ + $eval/scoring_kaldi/wer_details_devbest/ $eval/scoring_kaldi/wer_details_devbest/uttid_location + +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt_loc +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room 
$room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-6 challenge track 1 ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime6/s5_track1/local/train_lms_srilm.sh b/egs/chime6/s5_track1/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime6/s5_track1/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. 
" + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! 
-f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . 
"\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk 
"$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + 
-text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 
-gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." 
+ echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime6/s5_track1/local/wer_output_filter b/egs/chime6/s5_track1/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime6/s5_track1/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while() { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . " " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime6/s5_track1/path.sh b/egs/chime6/s5_track1/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime6/s5_track1/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track1/run.sh b/egs/chime6/s5_track1/run.sh new file mode 100755 index 00000000000..0890a939faf --- /dev/null +++ b/egs/chime6/s5_track1/run.sh @@ -0,0 +1,280 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +decode_stage=1 +decode_only=false +num_data_reps=4 +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # gss or beamformit + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=16 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +test_sets="dev_${enhancement} eval_${enhancement}" +train_set=train_worn_simu_u400k + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the 
synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. +########################################################################### + +if [ $stage -le 1 ]; then + echo "$0: prepare data..." + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + echo "$0: train lm ..." + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + echo "$0: prepare lang..." + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +######################################################################################### +# In stages 4 to 7, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. 
+######################################################################################### + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 5 ]; then + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh 
data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${train_set}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done +fi + +################################################################################## +# Now make 13-dim MFCC features. We use 13-dim fetures for GMM-HMM systems. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." + mfccdir=mfcc + for x in ${train_set}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +################################################################################### +# Stages 9 to 13 train monophone and triphone models. 
They will be used for +# generating lattices for training the chain model +################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +####################################################################### +# Perform data cleanup for training data. 
+####################################################################### + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# skipping decoding here and performing it in step 16 +########################################################################## + +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# DECODING is done in the local/decode.sh script. 
This script performs +# enhancement, fixes test sets performs feature extraction and 2 stage decoding +########################################################################## + +if [ $stage -le 16 ]; then + local/decode.sh --stage $decode_stage \ + --enhancement $enhancement \ + --train_set "$train_set" +fi + +exit 0; diff --git a/egs/chime6/s5_track1/steps b/egs/chime6/s5_track1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5_track1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5_track1/utils b/egs/chime6/s5_track1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5_track1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/RESULTS b/egs/chime6/s5_track2/RESULTS new file mode 100644 index 00000000000..eacee196584 --- /dev/null +++ b/egs/chime6/s5_track2/RESULTS @@ -0,0 +1,19 @@ +# Results for Chime-6 track 2 for dev and eval, using pretrained models +# available at http://kaldi-asr.org/models/m12. + +# Speech Activity Detection (SAD) + Missed speech False alarm Total error +Dev 4.3 2.1 6.4 +Eval 5.6 5.9 11.5 + +# The results for the remaining pipeline are only for array U06. + +# Diarization + DER JER +Dev 31.37 20.45 +Eval 30.67 18.97 + +# ASR nnet3 tdnn+chain +Dev: U06 58881 48061 81.62 +Eval: U06 55132 47184 85.58 + diff --git a/egs/chime6/s5_track2/cmd.sh b/egs/chime6/s5_track2/cmd.sh new file mode 100644 index 00000000000..86514d94d4d --- /dev/null +++ b/egs/chime6/s5_track2/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/chime6/s5_track2/conf/beamformit.cfg b/egs/chime6/s5_track2/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime6/s5_track2/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### 
+#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime6/s5_track2/conf/mfcc.conf b/egs/chime6/s5_track2/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime6/s5_track2/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime6/s5_track2/conf/mfcc_hires.conf b/egs/chime6/s5_track2/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime6/s5_track2/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track2/conf/online_cmvn.conf b/egs/chime6/s5_track2/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track2/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track2/conf/sad.conf b/egs/chime6/s5_track2/conf/sad.conf new file mode 100644 index 00000000000..752bb1cf6c5 --- /dev/null +++ b/egs/chime6/s5_track2/conf/sad.conf @@ -0,0 +1,2 @@ +affix=_1a +nnet_type=stats diff --git a/egs/chime6/s5_track2/diarization b/egs/chime6/s5_track2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/chime6/s5_track2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/chain b/egs/chime6/s5_track2/local/chain new file mode 120000 index 
#! /usr/bin/env python
# Copyright 2019 Vimal Manohar
# Apache 2.0.

"""This script converts an RTTM with
speaker info into kaldi utt2spk and segments"""

import argparse


def get_args():
    """Parse and validate command-line arguments.

    The two boolean-ish options arrive as the strings "true"/"false" and are
    normalized to real booleans before returning.
    """
    parser = argparse.ArgumentParser(
        description="""This script converts an RTTM with
        speaker info into kaldi utt2spk and segments""")
    parser.add_argument("--use-reco-id-as-spkr", type=str,
                        choices=["true", "false"], default="false",
                        help="Use the recording ID based on RTTM and "
                        "reco2file_and_channel as the speaker")
    parser.add_argument("--append-reco-id-to-spkr", type=str,
                        choices=["true", "false"], default="false",
                        help="Append recording ID to the speaker ID")
    parser.add_argument("rttm_file", type=str,
                        help="Input RTTM file; only SPEAKER lines are used, "
                        "all other line types are ignored")
    parser.add_argument("reco2file_and_channel", type=str,
                        help="Input reco2file_and_channel file, one "
                        "'<reco-id> <file-id> <channel>' triple per line")
    parser.add_argument("utt2spk", type=str,
                        help="Output utt2spk file")
    parser.add_argument("segments", type=str,
                        help="Output segments file")

    args = parser.parse_args()

    args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true")
    args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true")

    if args.use_reco_id_as_spkr:
        if args.append_reco_id_to_spkr:
            raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true")

    return args


def main():
    args = get_args()

    # Map (file-id, channel) -> recording-id using reco2file_and_channel.
    file_and_channel2reco = {}
    with open(args.reco2file_and_channel) as f:
        for line in f:
            parts = line.strip().split()
            file_and_channel2reco[(parts[1], parts[2])] = parts[0]

    utt2spk = {}
    segments = {}
    with open(args.rttm_file) as f:
        for line in f:
            parts = line.strip().split()
            if parts[0] != "SPEAKER":
                continue

            file_id = parts[1]
            channel = parts[2]

            try:
                reco = file_and_channel2reco[(file_id, channel)]
            except KeyError as e:
                raise Exception("Could not find recording with "
                                "(file_id, channel) "
                                "= ({0},{1}) in {2}: {3}\n".format(
                                    file_id, channel,
                                    args.reco2file_and_channel, str(e)))

            # RTTM stores start time and duration; convert to start/end.
            start_time = float(parts[3])
            end_time = start_time + float(parts[4])

            if args.use_reco_id_as_spkr:
                spkr = reco
            elif args.append_reco_id_to_spkr:
                spkr = reco + "-" + parts[7]
            else:
                spkr = parts[7]

            # Encode times in centiseconds (zero-padded) so that utterance
            # ids sort chronologically as plain strings.
            st = int(start_time * 100)
            end = int(end_time * 100)
            utt = "{0}-{1:06d}-{2:06d}".format(spkr, st, end)
            utt2spk[utt] = spkr
            segments[utt] = (reco, start_time, end_time)

    # Fix: the original never closed these writers (leaked file handles and
    # relied on interpreter exit to flush); 'with' closes deterministically.
    with open(args.utt2spk, 'w') as utt2spk_writer, \
         open(args.segments, 'w') as segments_writer:
        for uttid_id in sorted(utt2spk):
            utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id]))
            segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format(
                uttid_id, segments[uttid_id][0],
                segments[uttid_id][1], segments[uttid_id][2]))


if __name__ == '__main__':
    main()
#!/bin/bash
#
# This script decodes raw utterances through the entire pipeline:
# Feature extraction -> SAD -> Diarization -> ASR
#
# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
# 2019 Desh Raj, David Snyder, Ashish Arora
# Apache 2.0

# Begin configuration section.
nj=8
decode_nj=10
stage=0
sad_stage=0
diarizer_stage=0
decode_diarize_stage=0
score_stage=0
enhancement=beamformit

# chime5 main directory path
# please change the path accordingly
chime5_corpus=/export/corpora4/CHiME5
# chime6 data directories, which are generated from ${chime5_corpus},
# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly
chime6_corpus=${PWD}/CHiME6
json_dir=${chime6_corpus}/transcriptions
audio_dir=${chime6_corpus}/audio

enhanced_dir=enhanced
enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1

# training data
train_set=train_worn_simu_u400k
test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb"

. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh
# sad.conf provides $affix and $nnet_type used for the SAD model paths below.
. ./conf/sad.conf

# This script also needs the phonetisaurus g2p, srilm, beamformit
./local/check_tools.sh || exit 1

###########################################################################
# We first generate the synchronized audio files across arrays and
# corresponding JSON files. Note that this requires sox v14.4.2,
# which is installed via miniconda in ./local/check_tools.sh
###########################################################################

if [ $stage -le 0 ]; then
  local/generate_chime6_data.sh \
    --cmd "$train_cmd" \
    ${chime5_corpus} \
    ${chime6_corpus}
fi

#######################################################################
# Prepare the dev and eval data with dereverberation (WPE) and
# beamforming.
#######################################################################
if [ $stage -le 1 ]; then
  # Beamforming using reference arrays
  # enhanced WAV directory
  enhandir=enhan
  dereverb_dir=${PWD}/wav/wpe/

  # NOTE(review): array u05 is intentionally absent from this list —
  # confirm against the challenge data description before changing.
  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \
        ${audio_dir}/${dset} \
        ${dereverb_dir}/${dset} \
        ${mictype}
    done
  done

  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_beamformit.sh --cmd "$train_cmd" \
        ${dereverb_dir}/${dset} \
        ${enhandir}/${dset}_${enhancement}_${mictype} \
        ${mictype}
    done
  done

  # Note that for the evaluation sets, we use the flag
  # "--train false". This keeps the files segments, text,
  # and utt2spk with .bak extensions, so that they can
  # be used later for scoring if needed but are not used
  # in the intermediate stages.
  for dset in dev eval; do
    local/prepare_data.sh --mictype ref --train false \
      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
      ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb
  done
fi

if [ $stage -le 2 ]; then
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  mfccdir=mfcc
  for x in ${test_sets}; do
    steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \
      --mfcc-config conf/mfcc_hires.conf \
      data/$x exp/make_mfcc/$x $mfccdir
  done
fi

#######################################################################
# Perform SAD on the dev/eval data
#######################################################################
# $affix and $nnet_type come from conf/sad.conf sourced above.
dir=exp/segmentation${affix}
sad_work_dir=exp/sad${affix}_${nnet_type}/
sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a

if [ $stage -le 3 ]; then
  for datadir in ${test_sets}; do
    test_set=data/${datadir}
    if [ ! -f ${test_set}/wav.scp ]; then
      echo "$0: Not performing SAD on ${test_set}"
      exit 0
    fi
    # Perform segmentation
    local/segmentation/detect_speech_activity.sh --nj $decode_nj --stage $sad_stage \
      $test_set $sad_nnet_dir mfcc $sad_work_dir \
      data/${datadir} || exit 1

    mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg
    # Generate RTTM file from segmentation performed by SAD. This can
    # be used to evaluate the performance of the SAD as an intermediate
    # step.
    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
      data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \
      data/${datadir}_${nnet_type}_seg/rttm
  done
fi

#######################################################################
# Perform diarization on the dev/eval data
#######################################################################
if [ $stage -le 4 ]; then
  for datadir in ${test_sets}; do
    local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \
      exp/xvector_nnet_1a \
      data/${datadir}_${nnet_type}_seg \
      exp/${datadir}_${nnet_type}_seg_diarization
  done
fi

#######################################################################
# Decode diarized output using trained chain model
#######################################################################
if [ $stage -le 5 ]; then
  for datadir in ${test_sets}; do
    local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
      exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang_chain \
      exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \
      data/${datadir}_diarized
  done
fi

#######################################################################
# Score decoded dev/eval sets
#######################################################################
if [ $stage -le 6 ]; then
  for datadir in ${test_sets}; do
    local/multispeaker_score.sh --cmd "$train_cmd" --stage $score_stage \
      --datadir $datadir data/${datadir}_diarized_hires/text \
      exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${datadir}_diarized_2stage/scoring_kaldi/penalty_1.0/10.txt \
      exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${datadir}_diarized_2stage/scoring_kaldi_multispeaker
  done
fi
exit 0;
#!/bin/bash
# Copyright 2019 Ashish Arora, Vimal Manohar
# Apache 2.0.
# This script takes an rttm file, and performs decoding on a test directory.
# The output directory contains a text file which can be used for scoring.


stage=0
nj=8
cmd=queue.pl
echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
  echo "Usage: $0 <rttm-dir> <in-data-dir> <lang-dir> <asr-model-dir> <ivector-extractor-dir> <out-data-dir>"
  echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain_train_worn_simu_u400k_cleaned_rvb \
  exp/nnet3_train_worn_simu_u400k_cleaned_rvb data/dev_diarized"
  echo "Options: "
  echo " --nj <nj>                  # number of parallel jobs."
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

rttm_dir=$1
data_in=$2
lang_dir=$3
asr_model_dir=$4
ivector_extractor=$5
out_dir=$6

# Fail early if any required input file is missing.
for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \
    $lang_dir/L.fst $asr_model_dir/tree_sp/graph/HCLG.fst \
    $asr_model_dir/tdnn1b_sp/final.mdl; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0 copying data files in output directory"
  cp $rttm_dir/rttm $rttm_dir/rttm_1
  # Strip the ".ENH" suffix from recording ids.
  # Fix: escape the dot — the original pattern '.ENH' treated '.' as a
  # regex metacharacter and would also delete e.g. "XENH".
  sed -i 's/\.ENH//g' $rttm_dir/rttm_1
  mkdir -p ${out_dir}_hires
  cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires
  utils/data/get_reco2dur.sh ${out_dir}_hires
fi

if [ $stage -le 1 ]; then
  echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel "
  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_1 \
    <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_1 |sort -u) \
    ${out_dir}_hires/utt2spk ${out_dir}_hires/segments

  utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt

  awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel
  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
fi

if [ $stage -le 2 ]; then
  # Fix: typo "freatures" -> "features"; also honor the --cmd option
  # instead of hard-coding queue.pl (the script already defines $cmd).
  echo "$0 extracting mfcc features using segments file"
  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires
  steps/compute_cmvn_stats.sh ${out_dir}_hires
  cp $data_in/text.bak ${out_dir}_hires/text
fi

if [ $stage -le 3 ]; then
  echo "$0 performing decoding on the extracted features"
  local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
    --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \
    $out_dir $lang_dir $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/
fi
stage=0
nj=10
cmd="run.pl"
ref_rttm=

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
  echo "Options: "
  echo " --nj <nj>                  # number of parallel jobs."
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --ref-rttm <ref-rttm>      # if present, used to score output RTTM."
  exit 1;
fi

model_dir=$1
data_in=$2
out_dir=$3

name=`basename $data_in`

# Fail early if the x-vector model or input features are missing.
for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
    $model_dir/final.raw $model_dir/extract.config; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0: keeping only data corresponding to array U06 "
  echo "$0: we can skip this stage, to perform diarization on all arrays "
  cp -r data/$name data/${name}.bak
  mv data/$name/wav.scp data/$name/wav.scp.bak
  grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp
  utils/fix_data_dir.sh data/$name
  nj=2  # since we have reduced number of "speakers" now
fi

if [ $stage -le 1 ]; then
  echo "$0: computing features for x-vector extractor"
  utils/fix_data_dir.sh data/${name}
  rm -rf data/${name}_cmn
  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
    data/$name data/${name}_cmn exp/${name}_cmn
  cp data/$name/segments exp/${name}_cmn/
  utils/fix_data_dir.sh data/${name}_cmn
fi

if [ $stage -le 2 ]; then
  echo "$0: extracting x-vectors for all segments"
  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
    --min-segment 0.5 $model_dir \
    data/${name}_cmn $out_dir/xvectors_${name}
fi

# Perform PLDA scoring
if [ $stage -le 3 ]; then
  # Perform PLDA scoring on all pairs of segments for each recording.
  echo "$0: performing PLDA scoring between all pairs of x-vectors"
  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
    --target-energy 0.5 \
    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
    $out_dir/xvectors_${name}/plda_scores
fi

if [ $stage -le 4 ]; then
  echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)"
  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
  diarization/cluster.sh --cmd "$cmd" --nj $nj \
    --reco2num-spk data/$name/reco2num_spk \
    --rttm-channel 1 \
    $out_dir/xvectors_${name}/plda_scores $out_dir
  echo "$0: wrote RTTM to output directory ${out_dir}"
fi

if [ $stage -le 5 ]; then
  # Fix: the original used 'if [ -f $ref_rttm ]' — with the default-empty
  # ref_rttm that is single-argument test ('[ -f ]'), which is always true,
  # so md-eval ran with an empty reference. Quote and require non-empty.
  if [ -n "$ref_rttm" ] && [ -f "$ref_rttm" ]; then
    echo "$0: computing diarization error rate (DER) using reference ${ref_rttm}"
    # Fix: also create the log directory used by the redirection below.
    mkdir -p $out_dir/tuning/ $out_dir/log/
    md-eval.pl -c 0.25 -1 -r $ref_rttm -s $out_dir/rttm 2> $out_dir/log/der.log > $out_dir/der
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' ${out_dir}/der)
    echo "DER: $der%"
  fi
fi
#!/usr/bin/env python3
# Copyright 2019 Yusuke Fujita
# Apache 2.0.

"""This script generates hypothesis utterances aligned with reference segments.
 Usage: gen_align_hyp.py alignment.txt wc.txt > hyp.txt
 alignment.txt is a session-level word alignment generated by align-text command.
 wc.txt is a sequence of utt-id:reference_word_count generated by 'local/get_ref_perspeaker_persession_file.py'.
"""

import sys, io
# NOTE(review): 'string' appears unused in this file; kept as-is.
import string
# Write UTF-8 to stdout regardless of the environment's locale settings.
output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

def load_align_text(f):
    # Parse align-text output: "<reco-id> ref1 hyp1 ; ref2 hyp2 ; ..."
    # and return {reco-id: [(ref_word, hyp_word), ...]} in file order.
    alignments = {}
    for line in f:
        recoid, res = line.split(None, 1)
        alignments[recoid] = []
        toks = res.split(';')
        for tok in toks:
            # assumes each ';'-separated token contains exactly two fields
            # (ref and hyp, possibly epsilon markers) — TODO confirm how
            # align-text encodes pure insertions/deletions here
            ref, hyp = tok.split()
            alignments[recoid].append((ref, hyp))
    return alignments

alignments = load_align_text(open(sys.argv[1],'r', encoding='utf8'))

# Walk each session's alignment in lock-step with the per-utterance reference
# word counts (wc.txt), emitting one hypothesis line per reference utt-id.
for line in open(sys.argv[2],'r', encoding='utf8'):
    recoid, res = line.split(None, 1)
    # Fresh iterator over this session's alignment pairs; consumed across
    # all utterances of the session below.
    ali = iter(alignments[recoid])
    toks = res.split()
    for tok in toks:
        uttid, count = tok.split(':')
        count = int(count)
        text = ''
        for i in range(count):
            # Consume alignment pairs until one reference word is matched,
            # collecting any hypothesis words (insertions) seen on the way.
            # NOTE(review): raises StopIteration if counts exceed the
            # alignment length — presumably inputs are generated consistently.
            while True:
                ref, hyp = ali.__next__()
                if hyp != '':
                    text += ' ' + hyp
                if ref != '':
                    break
        output.write(uttid + ' ' + text.strip() + '\n')
#! /usr/bin/env python3
# Copyright 2019 Ashish Arora
# Apache 2.0.
"""This script finds best matching of reference and hypothesis speakers.
   For the best matching speakers,it provides the WER for the reference session
   (eg:S02) and hypothesis recording (eg: S02_U02)"""

import argparse
from munkres import Munkres


def get_args():
    """Parse command-line arguments: WER file directory and recording id."""
    parser = argparse.ArgumentParser(
        description="""This script finds best matching of reference and hypothesis speakers.
        For the best matching it provides the WER""")
    parser.add_argument("WER_dir", type=str,
                        help="path of WER files")
    parser.add_argument("recording_id", type=str,
                        help="recording_id name")
    args = parser.parse_args()
    return args


def get_results(filename):
    """Parse the first (summary) line of a Kaldi wer_* file.

    The line is comma-separated; returns the counts
    (total_words, insertions, deletions, substitutions) as strings.
    """
    with open(filename) as f:
        first_line = f.readline()
    parts = first_line.strip().split(',')
    total_words = parts[0].split()[-1]
    ins = parts[1].split()[0]
    deletions = parts[2].split()[0]
    sub = parts[3].split()[0]
    return total_words, ins, deletions, sub


def get_min_wer(recording_id, num_speakers, WER_dir):
    """Find the ref->hyp speaker assignment that minimizes total errors.

    Reads WER_dir/wer_<recording_id>_r<i>h<j> for every (reference i,
    hypothesis j) speaker pair, solves the assignment problem with the
    Hungarian algorithm (munkres), then writes and prints the best total
    error counts and the chosen hypothesis speaker order.
    """
    best_wer_file = WER_dir + '/' + 'best_wer' + '_' + recording_id
    best_wer_writer = open(best_wer_file, 'w')
    m = Munkres()
    # total_error_mat[i][j]: total errors scoring ref speaker i+1 against
    # hyp speaker j+1; all_errors_mat keeps the full per-pair breakdown.
    total_error_mat = [0] * num_speakers
    all_errors_mat = [0] * num_speakers
    for i in range(num_speakers):
        total_error_mat[i] = [0] * num_speakers
        all_errors_mat[i] = [0] * num_speakers
    for i in range(1, num_speakers+1):
        for j in range(1, num_speakers+1):
            filename = '/wer_' + recording_id + '_' + 'r' + str(i) + 'h' + str(j)
            filename = WER_dir + filename
            total_words, ins, deletions, sub = get_results(filename)
            ins = int(ins)
            dele = int(deletions)
            sub = int(sub)
            total_error = ins + dele + sub
            total_error_mat[i-1][j-1] = total_error
            all_errors_mat[i-1][j-1] = (total_words, total_error, ins, dele, sub)

    indexes = m.compute(total_error_mat)

    total_errors = total_words = total_ins = total_del = total_sub = 0
    spk_order = '('
    for row, column in indexes:
        words, errs, ins, dele, sub = all_errors_mat[row][column]
        total_errors += int(errs)
        total_words += int(words)
        total_ins += int(ins)
        # Bug fix: the original did 'total_del += int(deletions)', summing a
        # stale variable left over from the file-reading loop above (the last
        # file's deletion count, added once per matched pair) instead of this
        # pair's own deletion count 'dele'.
        total_del += int(dele)
        total_sub += int(sub)
        spk_order = spk_order + str(column+1) + ', '
    spk_order = spk_order + ')'
    text = "Best error: (#T #E #I #D #S) " + str(total_words) + ', ' + str(total_errors) + ', ' + str(total_ins) + ', ' + str(total_del) + ', ' + str(total_sub)
    best_wer_writer.write(" recording_id: " + recording_id + ' ')
    best_wer_writer.write(' best hypothesis speaker order: ' + spk_order + ' ')
    best_wer_writer.write(text + '\n')
    print("recording_id: " + recording_id + ' ')
    print('best hypothesis speaker order: ' + spk_order + ' ')
    print(text)
    best_wer_writer.close()


def main():
    args = get_args()
    # CHiME-6 sessions have 4 participants each.
    num_speakers = 4
    get_min_wer(args.recording_id, num_speakers, args.WER_dir)


if __name__ == '__main__':
    main()
+"""This script splits a kaldi (text) file + into per_array per_session per_speaker hypothesis (text) files""" + +import argparse +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_array per_session per_speaker text files""") + parser.add_argument("input_text_path", type=str, + help="path of text files") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_array per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + # S09_U06.ENH-4-704588-704738 + args = get_args() + sessionid_micid_speakerid_dict= {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + temp = uttid_id.strip().split('.')[0] + micid = temp.strip().split('_')[1] + speakerid = uttid_id.strip().split('-')[1] + sessionid = uttid_id.strip().split('_')[0] + sessionid_micid_speakerid = sessionid + '_' + micid + '_' + speakerid + if sessionid_micid_speakerid not in sessionid_micid_speakerid_dict: + sessionid_micid_speakerid_dict[sessionid_micid_speakerid]=list() + sessionid_micid_speakerid_dict[sessionid_micid_speakerid].append(line) + + for sessionid_micid_speakerid in sorted(sessionid_micid_speakerid_dict): + hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + hyp_writer = open(hyp_file, 'w') + combined_hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + '_comb' + combined_hyp_writer = open(combined_hyp_file, 'w') + utterances = sessionid_micid_speakerid_dict[sessionid_micid_speakerid] + text = '' + for line in utterances: + parts = line.strip().split() + text = text + ' ' + ' '.join(parts[1:]) + hyp_writer.write(line) + combined_utterance = 'utt' + " " + text + combined_hyp_writer.write(combined_utterance) + combined_hyp_writer.write('\n') + combined_hyp_writer.close() + hyp_writer.close() + + +if __name__ == '__main__': + main() + diff --git 
#! /usr/bin/env python
# Copyright 2019 Ashish Arora
# Apache 2.0.
"""Split a kaldi reference (text) file into per-session, per-speaker
reference files.  For every (session, speaker) pair it writes:
  ref_<session>_<n>        the original utterance lines
  ref_wc_<session>_<n>     a "utt <uttid>:<word-count> ..." bookkeeping line
  ref_<session>_<n>_comb   one pseudo-utterance with all words concatenated
where <n> is a 1-based speaker index assigned per session."""

import argparse


def get_args():
    """Parse and return the command-line arguments."""
    parser = argparse.ArgumentParser(
        description="""This script splits a kaldi text file
                    into per_speaker per_session text files""")
    parser.add_argument("input_text_path", type=str,
                        help="path of text file")
    parser.add_argument("output_dir_path", type=str,
                        help="Output path for per_session per_speaker reference files")
    return parser.parse_args()


def main():
    args = get_args()
    # Utterance ids carry the speaker first, then the session (e.g. P05_S02...).
    by_session_speaker = {}  # "<session>_<speaker>" -> list of original lines
    for raw_line in open(args.input_text_path):
        utt_id = raw_line.strip().split()[0]
        speaker = utt_id.split('_')[0]
        session = utt_id.split('_')[1]
        by_session_speaker.setdefault(session + '_' + speaker, []).append(raw_line)

    # Assign each speaker a 1-based index.  The counter restarts on every new
    # session, but a speaker keeps the first index it was ever assigned.
    speaker_index = {}
    next_index = 1
    last_session = ''
    for group_key in sorted(by_session_speaker):
        session = group_key.split('_')[0]
        speaker = group_key.split('_')[1]
        if session != last_session:
            last_session = session
            next_index = 1
        if speaker not in speaker_index:
            speaker_index[speaker] = next_index
            next_index += 1

    for group_key in sorted(by_session_speaker):
        session = group_key.split('_')[0]
        index = str(speaker_index[group_key.split('_')[1]])
        base = args.output_dir_path + '/ref_' + session + '_' + index
        ref_out = open(base, 'w')
        wc_out = open(args.output_dir_path + '/ref_wc_' + session + '_' + index, 'w')
        combined_out = open(base + '_comb', 'w')
        all_words = ''
        wc_line = 'utt'
        for raw_line in by_session_speaker[group_key]:
            fields = raw_line.strip().split()
            utt_words = ' '.join(fields[1:])
            all_words = all_words + ' ' + utt_words
            ref_out.write(raw_line)
            # Record "<uttid>:<word-count>" for later re-segmentation.
            wc_line = wc_line + ' ' + fields[0] + ':' + str(len(utt_words.split()))
        combined_out.write('utt ' + all_words)
        combined_out.write('\n')
        combined_out.close()
        wc_out.write(wc_line)
        wc_out.write('\n')
        wc_out.close()
        ref_out.close()


if __name__ == '__main__':
    main()
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#
# Prepares the VoxCeleb1 test and train data directories in kaldi format.
# Usage: make_voxceleb1.pl /export/voxceleb1 data/

if (@ARGV != 2) {
  # Restored argument placeholders in the usage message.
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

($data_base, $out_dir) = @ARGV;
my $out_test_dir = "$out_dir/voxceleb1_test";
my $out_train_dir = "$out_dir/voxceleb1_train";

if (system("mkdir -p $out_test_dir") != 0) {
  die "Error making directory $out_test_dir";
}

if (system("mkdir -p $out_train_dir") != 0) {
  die "Error making directory $out_train_dir";
}

opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!";
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

# Download the trial list and speaker metadata if not already present.
if (! -e "$data_base/voxceleb1_test.txt") {
  system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt");
}

if (! -e "$data_base/vox1_meta.csv") {
  system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv");
}

open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt";
open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk";
open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp";
open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk";
open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp";
open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials";

# Map the anonymized VoxCeleb ids to the original speaker labels.
my %id2spkr = ();
# BUGFIX: the readline operator was missing ("while ()" is not valid Perl);
# read the metadata file line by line.
while (<META_IN>) {
  chomp;
  my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
  $id2spkr{$vox_id} = $spkr_id;
}

# BUGFIX: was "my $test_spkrs = ();" (declares a scalar) while the code below
# uses the hash %test_spkrs; declare the hash.
my %test_spkrs = ();
# BUGFIX: readline operator restored here as well.
while (<TRIAL_IN>) {
  chomp;
  my ($tar_or_non, $path1, $path2) = split;

  # Create entry for left-hand side of trial
  my ($spkr_id1, $filename1) = split('/', $path1);
  my $rec_id1 = substr($filename1, 0, 11);
  my $segment1 = substr($filename1, 12, 7);
  my $utt_id1 = "$spkr_id1-$rec_id1-$segment1";
  $test_spkrs{$spkr_id1} = ();

  # Create entry for right-hand side of trial
  # BUGFIX: the original re-declared "my ($spkr_id, $filename)" in the same
  # scope; use distinct names for the two sides of the trial.
  my ($spkr_id2, $filename2) = split('/', $path2);
  my $rec_id2 = substr($filename2, 0, 11);
  my $segment2 = substr($filename2, 12, 7);
  my $utt_id2 = "$spkr_id2-$rec_id2-$segment2";
  $test_spkrs{$spkr_id2} = ();

  my $target = "nontarget";
  if ($tar_or_non eq "1") {
    $target = "target";
  }
  print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
}

foreach (@spkr_dirs) {
  my $spkr_id = $_;
  my $new_spkr_id = $spkr_id;
  # If we're using a newer version of VoxCeleb1, we need to "deanonymize"
  # the speaker labels.
  if (exists $id2spkr{$spkr_id}) {
    $new_spkr_id = $id2spkr{$spkr_id};
  }
  opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
  closedir $dh;
  foreach (@files) {
    my $filename = $_;
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
    my $utt_id = "$new_spkr_id-$rec_id-$segment";
    # Utterances of speakers that appear in the trial list go to the test
    # directory; everything else goes to train.
    if (exists $test_spkrs{$new_spkr_id}) {
      print WAV_TEST "$utt_id", " $wav", "\n";
      print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
    } else {
      print WAV_TRAIN "$utt_id", " $wav", "\n";
      print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n";
    }
  }
}

close(SPKR_TEST) or die;
close(WAV_TEST) or die;
close(SPKR_TRAIN) or die;
close(WAV_TRAIN) or die;
close(TRIAL_OUT) or die;
close(TRIAL_IN) or die;
close(META_IN) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_test_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) {
  die "Error validating directory $out_test_dir";
}

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_train_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) {
  die "Error validating directory $out_train_dir";
}
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#
# Prepares a VoxCeleb2 data directory in kaldi format, decoding the .m4a
# files on the fly with ffmpeg.
# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev
#
# Note: This script requires ffmpeg to be installed and its location included in $PATH.

if (@ARGV != 3) {
  # Restored argument placeholders in the usage message.
  print STDERR "Usage: $0 <path-to-voxceleb2> <dataset> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n";
  exit(1);
}

# Check that ffmpeg is installed.
if (`which ffmpeg` eq "") {
  die "Error: this script requires that ffmpeg is installed.";
}

($data_base, $dataset, $out_dir) = @ARGV;

if ("$dataset" ne "dev" && "$dataset" ne "test") {
  die "dataset parameter must be 'dev' or 'test'!";
}

opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!";
my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

if (system("mkdir -p $out_dir") != 0) {
  die "Error making directory $out_dir";
}

open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";

# Walk the speaker/recording/file hierarchy and emit one wav.scp entry per
# .m4a, decoded to 16-bit PCM wav on a pipe.
foreach (@spkr_dirs) {
  my $spkr_id = $_;

  opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!";
  my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
  closedir $dh;

  foreach (@rec_dirs) {
    my $rec_id = $_;

    opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
    my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh);
    closedir $dh;

    foreach (@files) {
      my $name = $_;
      my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|";
      my $utt_id = "$spkr_id-$rec_id-$name";
      print WAV "$utt_id", " $wav", "\n";
      print SPKR "$utt_id", " $spkr_id", "\n";
    }
  }
}
close(SPKR) or die;
close(WAV) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
#!/bin/bash
# Copyright 2019  Ashish Arora, Yusuke Fujita
# Apache 2.0.
# This script takes a reference and hypothesis text file, and performs
# multispeaker scoring.

stage=0
cmd=queue.pl
num_spkrs=4
num_hyp_spk=4
datadir=dev_beamformit_dereverb
declare -a recording_id_array=("S02_U06" "S09_U06")
echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  # Restored argument placeholders in the usage message.
  echo "Usage: $0 <ref-file> <hyp-file> <out-dir>"
  echo "e.g.: $0 data/diarized/text \
    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi/penalty_1.0/10.txt \
    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi_multispeaker"
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl)  # how to run jobs."
  exit 1;
fi

ref_file=$1
hyp_file=$2
out_dir=$3

output_dir=$out_dir/per_speaker_output
wer_dir=$out_dir/per_speaker_wer

# For dev and evaluation set, we take corresponding arrays
if [[ ${datadir} == *dev* ]]; then
  recording_id_array=("S02_U06" "S09_U06")
fi

if [[ ${datadir} == *eval* ]]; then
  recording_id_array=("S01_U06" "S21_U06")
fi

for f in $ref_file $hyp_file; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0 generate per speaker per session file at paragraph level for the reference"
  echo "and per speaker per array file at paragraph level for the hypothesis"
  mkdir -p $output_dir $wer_dir
  local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt
  local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt
  local/get_ref_perspeaker_persession_file.py $output_dir/ref_filt.txt $output_dir
  local/get_hyp_perspeaker_perarray_file.py $output_dir/hyp_filt.txt $output_dir
fi

if [ $stage -le 1 ]; then
  if [ $num_hyp_spk -le 3 ]; then
    echo "$0 create dummy per speaker per array hypothesis files for if the"
    echo " predicted number of speakers by diarization is less than 4 "
    for recording_id in "${recording_id_array[@]}"; do
      for (( i=$num_hyp_spk+1; i<$num_spkrs+1; i++ )); do
        # BUGFIX: was ${dir}, which is never defined in this script; the dummy
        # files must live where stage 2 reads them, i.e. ${output_dir}.
        echo 'utt ' > ${output_dir}/hyp_${recording_id}_${i}_comb
      done
    done
  fi
fi

if [ $stage -le 2 ]; then
  echo "$0 calculate wer for each ref and hypothesis speaker"
  for recording_id in "${recording_id_array[@]}"; do
    # Try every (reference speaker, hypothesis speaker) pairing; the best
    # permutation is selected afterwards by get_best_error.py.
    for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
      ind_r=$((i / num_spkrs + 1))
      ind_h=$((i % num_spkrs + 1))
      sessionid="$(echo $recording_id | cut -d'_' -f1)"

      # compute WER with combined texts
      compute-wer --text --mode=present ark:${output_dir}/ref_${sessionid}_${ind_r}_comb \
        ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb \
        > $wer_dir/wer_${recording_id}_r${ind_r}h${ind_h} 2>/dev/null
    done

    local/get_best_error.py $wer_dir $recording_id
  done
fi

if [ $stage -le 3 ]; then
  echo "$0 print best word error rate"
  echo "$0 it will print best wer for each recording and each array"
  cat $wer_dir/best_wer* > $wer_dir/all.txt
  cat $wer_dir/all.txt | local/print_dset_error.py $output_dir/recordinid_spkorder
fi

mkdir -p $wer_dir/wer_details $wer_dir/wer_details/log/
if [ $stage -le 4 ]; then
  echo "$0 generate per utterance wer details at utterance level"
  while read -r line;
  do
    recording_id=$(echo "$line" | cut -f1 -d ":")
    spkorder_str=$(echo "$line" | cut -f2 -d ":")
    sessionid=$(echo "$line" | cut -f1 -d "_")
    IFS='_' read -r -a spkorder_list <<< "$spkorder_str"
    IFS=" "
    ind_r=1
    for ind_h in "${spkorder_list[@]}"; do

      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_comb.log \
        align-text ark:${output_dir}/ref_${sessionid}_${ind_r}_comb ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb ark:$output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt

      # split hypothesis texts along with reference utterances using word alignment of combined texts
      local/gen_aligned_hyp.py $output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt ${output_dir}/ref_wc_${sessionid}_${ind_r} > ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation

      ## compute per utterance alignments
      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_per_utt.log \
        cat ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation \| \
        align-text --special-symbol="'***'" ark:${output_dir}/ref_${sessionid}_${ind_r} ark:- ark,t:- \| \
        utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} || exit 1

      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_ops.log \
        cat $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} \| \
        utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
        sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $wer_dir/wer_details/ops_${recording_id}_r${ind_r}h${ind_h} || exit 1;

      ind_r=$(( ind_r + 1 ))
    done
  done < $output_dir/recordinid_spkorder
  echo "$0 done generating per utterance wer details"
fi

echo "$0 done scoring"
#!/bin/bash
#
# Apache 2.0.

# This script applies sliding window CMVN and writes the features to disk.
#
# Although this kind of script isn't necessary in speaker recognition recipes,
# it can be helpful in the diarization recipes. The script
# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very
# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding
# window CMVN in a meaningful way, it must be performed prior to performing
# the subsegmentation.

nj=40
cmd="run.pl"
stage=0
norm_vars=false
center=true
compress=true
cmn_window=300

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  # Restored argument placeholders in the usage message.
  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
  echo "Options: "
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl)              # how to run jobs."
  echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
  exit 1;
fi

data_in=$1
data_out=$2
dir=$3

name=`basename $data_in`

for f in $data_in/feats.scp ; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
  utils/create_split_dir.pl \
    /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage
fi

for n in $(seq $nj); do
  # the next command does nothing unless $featdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark
done

cp $data_in/utt2spk $data_out/utt2spk
cp $data_in/spk2utt $data_out/spk2utt
cp $data_in/wav.scp $data_out/wav.scp
# BUGFIX: was "$data_in/segments $data_in/segments/vad.scp"; 'segments' is a
# file, so the second path could never exist -- vad.scp lives directly in
# the data directory.
for f in $data_in/segments $data_in/vad.scp ; do
  [ -f $f ] && cp $f $data_out/`basename $f`;
done

write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"

sdata_in=$data_in/split$nj;
utils/split_data.sh $data_in $nj || exit 1;

# Honor the --norm-vars/--center options instead of hard-coding their values
# (the variables were declared above but silently ignored).
$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \
  apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=$cmn_window \
  scp:${sdata_in}/JOB/feats.scp ark:- \| \
  copy-feats --compress=$compress $write_num_frames_opt ark:- \
  ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1;

for n in $(seq $nj); do
  cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1;
done > ${data_out}/feats.scp || exit 1

for n in $(seq $nj); do
  cat $featdir/log/utt2num_frames.$n || exit 1;
done > $data_out/utt2num_frames || exit 1
rm $featdir/log/utt2num_frames.*

echo "$0: Succeeded creating xvector features for $name"
#!/bin/bash
#
# Apache 2.0.

# This script applies sliding window CMVN and removes silence frames. This
# is performed on the raw features prior to generating examples for training
# the x-vector system. Once the training examples are generated, the features
# created by this script can be removed.

nj=40
cmd="run.pl"
stage=0
norm_vars=false
center=true
compress=true
cmn_window=300

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  # Restored argument placeholders in the usage message.
  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
  echo "Options: "
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl)              # how to run jobs."
  echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
  exit 1;
fi

data_in=$1
data_out=$2
dir=$3

name=`basename $data_in`

for f in $data_in/feats.scp $data_in/vad.scp ; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
  utils/create_split_dir.pl \
    /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage
fi

for n in $(seq $nj); do
  # the next command does nothing unless $featdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark
done

cp $data_in/utt2spk $data_out/utt2spk
cp $data_in/spk2utt $data_out/spk2utt
cp $data_in/wav.scp $data_out/wav.scp

write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"

sdata_in=$data_in/split$nj;
utils/split_data.sh $data_in $nj || exit 1;

# Honor the --norm-vars/--center options instead of hard-coding their values
# (the variables were declared above but silently ignored).
$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \
  apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=$cmn_window \
  scp:${sdata_in}/JOB/feats.scp ark:- \| \
  select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \
  copy-feats --compress=$compress $write_num_frames_opt ark:- \
  ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1;

for n in $(seq $nj); do
  cat $featdir/xvector_feats_${name}.$n.scp || exit 1;
done > ${data_out}/feats.scp || exit 1

for n in $(seq $nj); do
  cat $featdir/log/utt2num_frames.$n || exit 1;
done > $data_out/utt2num_frames || exit 1
rm $featdir/log/utt2num_frames.*

echo "$0: Succeeded creating xvector features for $name"
#!/bin/bash
# Copyright      2018   David Snyder
#                2018   Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2018   Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This script trains the x-vector DNN. The recipe is similar to the one
# described in "Diarization is Hard: Some Experiences and Lessons Learned
# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al.

. ./cmd.sh
set -e

stage=1
train_stage=-1
use_gpu=true
remove_egs=false

data=data/train
nnet_dir=exp/xvector_nnet_1a/
egs_dir=exp/xvector_nnet_1a/egs

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

# NOTE(review): num_pdfs is computed here but never used below -- confirm
# whether it can be removed.
num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)

# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh.
# The argument --num-repeats is related to the number of times a speaker
# repeats per archive. If it seems like you're getting too many archives
# (e.g., more than 200) try increasing the --frames-per-iter option. The
# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the
# minimum and maximum length (in terms of number of frames) of the features
# in the examples.
#
# To make sense of the egs script, it may be necessary to put an "exit 1"
# command immediately after stage 3. Then, inspect
# exp/<egs-dir>/egs/temp/ranges.* . The ranges files specify the examples that
# will be created, and which archives they will be stored in. Each line of
# ranges.* has the following form (fields restored; verify against
# get_egs.sh):
#    <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <num-frames> <spkr-id>
# For example:
#    100304-f-sre2006-kacg-A 1 2 4079 881 23

# If you're satisfied with the number of archives (e.g., 50-150 archives is
# reasonable) and with the number of examples per speaker (e.g., 1000-5000
# is reasonable) then you can let the script continue to the later stages.
# Otherwise, try increasing or decreasing the --num-repeats option. You might
# need to fiddle with --frames-per-iter. Increasing this value decreases
# the number of archives and increases the number of examples per archive.
# Decreasing this value increases the number of archives, while decreasing the
# number of examples per archive.
if [ $stage -le 6 ]; then
  echo "$0: Getting neural network training egs";
  # dump egs.
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
    utils/create_split_dir.pl \
     /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
  fi
  sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
    --nj 8 \
    --stage 0 \
    --frames-per-iter 1000000000 \
    --frames-per-iter-diagnostic 500000 \
    --min-frames-per-chunk 200 \
    --max-frames-per-chunk 400 \
    --num-diagnostic-archives 3 \
    --num-repeats 40 \
    "$data" $egs_dir
fi

if [ $stage -le 7 ]; then
  echo "$0: creating neural net configs using the xconfig parser";
  num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}')
  feat_dim=$(cat $egs_dir/info/feat_dim)

  # This chunk-size corresponds to the maximum number of frames the
  # stats layer is able to pool over. In this script, it corresponds
  # to 4 seconds. If the input recording is greater than 4 seconds,
  # we will compute multiple xvectors from the same recording and average
  # to produce the final xvector.
  max_chunk_size=400

  # The smallest number of frames we're comfortable computing an xvector from.
  # Note that the hard minimum is given by the left and right context of the
  # frame-level layers.
  min_chunk_size=20
  mkdir -p $nnet_dir/configs
  # BUGFIX: restore the here-document redirection; "cat < file" tried to READ
  # the (nonexistent) xconfig instead of writing it.
  cat <<EOF > $nnet_dir/configs/network.xconfig
  # please note that it is important to have input layer with the name=input

  # The frame-level layers
  input dim=${feat_dim} name=input
  relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
  relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
  relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
  relu-batchnorm-layer name=tdnn4 dim=512
  relu-batchnorm-layer name=tdnn5 dim=1500

  # The stats pooling layer. Layers after this are segment-level.
  # In the config below, the first and last argument (0, and ${max_chunk_size})
  # means that we pool over an input segment starting at frame 0
  # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1)
  # mean that no subsampling is performed.
  stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})

  # This is where we usually extract the embedding (aka xvector) from.
  relu-batchnorm-layer name=tdnn6 dim=128 input=stats
  output-layer name=output include-log-softmax=true dim=${num_targets}
EOF

  steps/nnet3/xconfig_to_configs.py \
    --xconfig-file $nnet_dir/configs/network.xconfig \
    --config-dir $nnet_dir/configs/
  cp $nnet_dir/configs/final.config $nnet_dir/nnet.config

  # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh
  echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config
  echo "$max_chunk_size" > $nnet_dir/max_chunk_size
  echo "$min_chunk_size" > $nnet_dir/min_chunk_size
fi

dropout_schedule='0,0@0.20,0.1@0.50,0'
srand=123
if [ $stage -le 8 ]; then
  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
    --cmd="$train_cmd" \
    --trainer.optimization.proportional-shrink 10 \
    --trainer.optimization.momentum=0.5 \
    --trainer.optimization.num-jobs-initial=3 \
    --trainer.optimization.num-jobs-final=8 \
    --trainer.optimization.initial-effective-lrate=0.001 \
    --trainer.optimization.final-effective-lrate=0.0001 \
    --trainer.optimization.minibatch-size=64 \
    --trainer.srand=$srand \
    --trainer.max-param-change=2 \
    --trainer.num-epochs=3 \
    --trainer.dropout-schedule="$dropout_schedule" \
    --trainer.shuffle-buffer-size=1000 \
    --egs.frames-per-eg=1 \
    --egs.dir="$egs_dir" \
    --cleanup.remove-egs $remove_egs \
    --cleanup.preserve-model-interval=10 \
    --use-gpu=$use_gpu \
    --dir=$nnet_dir || exit 1;
fi

exit 0;
00000000000..c6b8121dab0 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal) +# Apache 2.0 + +# Begin configuration section. +mictype=worn # worn, ref or others +cleanup=true +train=true + +# End configuration section +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + #grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # 
P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +if [ $train != 'true' ]; then + # For scoring the final system, we need the original utt2spk + # and text file. So we keep them with the extension .bak here + # so that they don't affect the validate_data_dir steps in + # the intermediate steps. + for file in text utt2spk spk2utt segments; do + mv $dir/$file $dir/$file.bak + done + + # For dev and eval data, prepare pseudo utt2spk. 
+ awk '{print $1, $1}' $dir/wav.scp > $dir/utt2spk + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +fi diff --git a/egs/chime6/s5_track2/local/prepare_dict.sh b/egs/chime6/s5_track2/local/prepare_dict.sh new file mode 120000 index 00000000000..ada30947463 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_dict.sh @@ -0,0 +1 @@ +../../s5_track1/local/prepare_dict.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/print_dset_error.py b/egs/chime6/s5_track2/local/print_dset_error.py new file mode 100755 index 00000000000..8d7988e2785 --- /dev/null +++ b/egs/chime6/s5_track2/local/print_dset_error.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +spkorder_writer = open(sys.argv[1],'w', encoding='utf8') +total_words={} +total_errors={} +spk_order={} +total_errors_arrayid={} +total_words_arrayid={} + +output.write('WER for each recording: \n') +for line in infile: + toks = line.strip().split() + recordingid = toks[1] + total_words[recordingid] = toks[-5][:-1] + total_errors[recordingid] = toks[-4][:-1] + spk_order[recordingid] = toks[6][1] + '_' + toks[7][0] + '_' + toks[8][0] + '_' + toks[9][0] + arrayid=recordingid.strip().split('_')[1] + if arrayid not in total_errors_arrayid: + total_errors_arrayid[arrayid]=0 + total_words_arrayid[arrayid]=0 + total_errors_arrayid[arrayid]+=int(total_errors[recordingid]) + total_words_arrayid[arrayid]+=int(total_words[recordingid]) + wer = float(total_errors[recordingid])/float(total_words[recordingid])*100 + utt = "{0} {1} {2} {3} {4:5.2f}".format(recordingid, spk_order[recordingid], total_words[recordingid], total_errors[recordingid], wer) + output.write(utt + '\n') + spkorder_writer.write(recordingid + ':' + str(spk_order[recordingid]) + '\n') + + +output.write('WER for each array: \n') +for arrayid 
in sorted(total_errors_arrayid): + wer = float(total_errors_arrayid[arrayid])/float(total_words_arrayid[arrayid])*100 + utt = "{0} {1} {2} {3:5.2f}".format(arrayid, total_words_arrayid[arrayid], total_errors_arrayid[arrayid], wer) + output.write(utt + '\n') + diff --git a/egs/chime6/s5_track2/local/reverberate_lat_dir.sh b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh new file mode 120000 index 00000000000..57302268f6d --- /dev/null +++ b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh @@ -0,0 +1 @@ +../../s5_track1/local/reverberate_lat_dir.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_beamformit.sh b/egs/chime6/s5_track2/local/run_beamformit.sh new file mode 120000 index 00000000000..832a16e3ba7 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_beamformit.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_beamformit.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_ivector_common.sh b/egs/chime6/s5_track2/local/run_ivector_common.sh new file mode 120000 index 00000000000..df7fca84335 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_ivector_common.sh @@ -0,0 +1 @@ +../../s5_track1/local/nnet3/run_ivector_common.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.py b/egs/chime6/s5_track2/local/run_wpe.py new file mode 120000 index 00000000000..6621607c932 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_wpe.py @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.sh b/egs/chime6/s5_track2/local/run_wpe.sh new file mode 120000 index 00000000000..187080e62e4 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_wpe.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/score.sh b/egs/chime6/s5_track2/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track2/local/score.sh @@ -0,0 +1 @@ 
+../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh new file mode 100755 index 00000000000..91d52b39269 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This script does nnet3-based speech activity detection given an input +# kaldi data directory and outputs a segmented kaldi data directory. +# This script can also do music detection and other similar segmentation +# using appropriate options such as --output-name output-music. + +set -e +set -o pipefail +set -u + +if [ -f ./path.sh ]; then . ./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=0.3 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. 
--speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." 
+ echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. +sad_nnet_dir=$2 # The SAD neural network +mfcc_dir=$3 # The directory to store the features +dir=$4 # Work directory +data_dir=$5 # The output data directory will be ${data_dir}_seg + +affix=${affix:+_$affix} +feat_affix=${feat_affix:+_$feat_affix} + +data_id=`basename $data_dir` +sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix} +seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix} +test_data_dir=data/${data_id}${feat_affix} + +############################################################################### +## Forward pass through the network network and dump the log-likelihoods. +############################################################################### + +frame_subsampling_factor=1 +if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor) +fi + +mkdir -p $dir +if [ $stage -le 1 ]; then + if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then + cp $sad_nnet_dir/cmvn_opts $dir || exit 1 + fi + + ######################################################################## + ## Initialize neural network for decoding using the output $output_name + ######################################################################## + + if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then + $cmd $dir/log/get_nnet_${output_name}.log \ + nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \ + $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1 + iter=${iter}_${output_name} + else + if ! 
diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then + cp $sad_nnet_dir/$iter.raw $dir/ + fi + fi + + steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \ + --iter ${iter} \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk --apply-exp true \ + --frame-subsampling-factor $frame_subsampling_factor \ + ${test_data_dir} $dir $sad_dir || exit 1 +fi + +############################################################################### +## Prepare FST we search to make speech/silence decisions. +############################################################################### + +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 + +graph_dir=${dir}/graph_${output_name} +if [ $stage -le 2 ]; then + mkdir -p $graph_dir + + # 1 for silence and 2 for speech + cat < $graph_dir/words.txt + 0 +silence 1 +speech 2 +EOF + + $cmd $graph_dir/log/make_graph.log \ + steps/segmentation/internal/prepare_sad_graph.py $graph_opts \ + --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \ + fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \ + $graph_dir/HCLG.fst +fi + +############################################################################### +## Do Viterbi decoding to create per-frame alignments. +############################################################################### + +post_vec=$sad_nnet_dir/post_${output_name}.vec +if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then + if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then + echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. 
" + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 3 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. +############################################################################### + +if [ $stage -le 4 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${test_data_dir} ${seg_dir} ${seg_dir} +fi + +if [ $stage -le 5 ]; then + utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \ + ${data_dir}_seg +fi + +echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg" +exit 0 diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh new file mode 100755 index 00000000000..5701424869a --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using LSTM for long-context information. 
+ +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 +extra_right_context=10 +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py 
--xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 
$targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh new file mode 100755 index 00000000000..bb985462f49 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts=$cmvn_opts \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + #$train_cmd $dir/log/get_priors.log \ + # matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + # ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + # Since the train data is individual microphones, while the dev and + # eval are beamformed, it is likely that the train contains a much + # higher ratio of silences. So using priors computed from the train + # data may miss a lot of speech in the dev/eval sets. Hence we manually + # tune the prior on the dev set. 
+ # With the following prior, the SAD system results are: + # Dev (using -c 0.25) + # MISSED SPEECH = 1188.59 secs ( 3.3 percent of scored time) + # FALARM SPEECH = 539.37 secs ( 1.5 percent of scored time) + echo "[ 30 2 1 ]" > $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi + diff --git a/egs/chime6/s5_track2/local/train_diarizer.sh b/egs/chime6/s5_track2/local/train_diarizer.sh new file mode 100755 index 00000000000..71918e7cabc --- /dev/null +++ b/egs/chime6/s5_track2/local/train_diarizer.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# Copyright +# 2019 David Snyder +# Apache 2.0. +# +# This script is based on the run.sh script in the Voxceleb v2 recipe. +# It trains an x-vector DNN for diarization. + +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +data_dir=train_worn_simu_u400k +model_dir=exp/xvector_nnet_1a + +stage=0 +train_stage=-1 + +. ./cmd.sh + +if [ -f ./path.sh ]; then . ./path.sh; fi +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +if [ $stage -le 0 ]; then + echo "$0: preparing voxceleb 2 data" + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + echo "$0: preparing voxceleb 1 data (see comments if this step fails)" + # The format of the voxceleb 1 corpus has changed several times since it was + # released. Therefore, our dataprep scripts may or may not fail depending + # on the version of the corpus you obtained. 
+ # If you downloaded the corpus soon after it was first released, this + # version of the dataprep script might work: + local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1 + # However, if you've downloaded the corpus recently, you may need to use the + # the following scripts instead: + #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We should now have about 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing features for training data (voxceleb 1 + 2)" + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/voxceleb + # Note that we apply CMN to the MFCCs and write these to the disk. These + # features will later be used to train the x-vector DNN. +fi + +# In this section, we augment the voxceleb data with reverberation. +# Note that we can probably improve the x-vector DNN if we include +# augmentations from the nonspeech regions of the Chime 6 training +# dataset. +if [ $stage -le 2 ]; then + echo "$0: applying augmentation to x-vector training data (just reverb for now)" + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur + + if [ ! 
-d "RIRS_NOISES" ]; then + echo "$0: downloading simulated room impulse response dataset" + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/voxceleb data/voxceleb_reverb + utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new + rm -rf data/voxceleb_reverb + mv data/voxceleb_reverb.new data/voxceleb_reverb +fi + +if [ $stage -le 3 ]; then + echo "$0: making MFCCs for augmented training data" + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb_reverb exp/make_mfcc $mfccdir + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. 
After + # creating voxceleb examples, this can be removed. + echo "$0: preparing features to train x-vector DNN" + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. + min_len=400 + mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new + mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk + utils/fix_data_dir.sh data/voxceleb_combined_cmn + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new + mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt + utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk + + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new + mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames + + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +# Stages 6 through 8 are handled in run_xvector.sh. 
+# This script trains the x-vector DNN on the augmented voxceleb data. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \ + --data data/voxceleb_combined_cmn --nnet-dir $model_dir \ + --egs-dir $model_dir/egs + +if [ $stage -le 9 ]; then + echo "$0: preparing a subset of Chime 6 training data to train PLDA model" + utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/plda_train exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/plda_train + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/plda_train data/plda_train_cmn exp/plda_train_cmn + if [ -f data/plda_train/segments ]; then + cp data/plda_train/segments data/plda_train_cmn/ + fi +fi + +if [ $stage -le 10 ]; then + echo "$0: extracting x-vector for PLDA training data" + utils/fix_data_dir.sh data/plda_train_cmn + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $model_dir \ + data/plda_train_cmn $model_dir/xvectors_plda_train +fi + +# Train PLDA models +if [ $stage -le 11 ]; then + echo "$0: training PLDA model" + $train_cmd $model_dir/xvectors_plda_train/log/plda.log \ + ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \ + | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $model_dir/xvectors_plda_train/plda || exit 1; + cp $model_dir/xvectors_plda_train/plda $model_dir/ + cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ + cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ +fi diff --git a/egs/chime6/s5_track2/local/train_lms_srilm.sh b/egs/chime6/s5_track2/local/train_lms_srilm.sh new file mode 120000 index 
00000000000..a7666f6cded --- /dev/null +++ b/egs/chime6/s5_track2/local/train_lms_srilm.sh @@ -0,0 +1 @@ +../../s5_track1/local/train_lms_srilm.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/train_sad.sh b/egs/chime6/s5_track2/local/train_sad.sh new file mode 100755 index 00000000000..e12a0cad694 --- /dev/null +++ b/egs/chime6/s5_track2/local/train_sad.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# 2019 Desh Raj +# Apache 2.0 + +# This script is based on local/run_asr_segmentation.sh script in the +# Aspire recipe. It demonstrates nnet3-based speech activity detection for +# segmentation. +# This script: +# 1) Prepares targets (per-frame labels) for a subset of training data +# using GMM models +# 2) Trains TDNN+Stats or TDNN+LSTM neural network using the targets +# 3) Demonstrates using the SAD system to get segments of dev data + +lang=data/lang # Must match the one used to train the models +lang_test=data/lang_test # Lang directory for decoding. + +data_dir= +test_sets= +# Model directory used to align the $data_dir to get target labels for training +# SAD. This should typically be a speaker-adapted system. +sat_model_dir= +# Model directory used to decode the whole-recording version of the $data_dir to +# get target labels for training SAD. This should typically be a +# speaker-independent system like LDA+MLLT system. +model_dir= +graph_dir= # Graph for decoding whole-recording version of $data_dir. + # If not provided, a new one will be created using $lang_test + +# List of weights on labels obtained from alignment; +# labels obtained from decoding; and default labels in out-of-segment regions +merge_weights=1.0,0.1,0.5 + +prepare_targets_stage=-10 +nstage=-10 +train_stage=-10 +stage=0 +nj=50 +reco_nj=40 + +# test options +test_nj=10 + +. ./cmd.sh +. ./conf/sad.conf + +if [ -f ./path.sh ]; then . ./path.sh; fi + +set -e -u -o pipefail +.
utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +mkdir -p $dir +mkdir -p ${sad_work_dir} + +# See $lang/phones.txt and decide which should be garbage +garbage_phones="laughs inaudible" +silence_phones="sil spn noise" + +for p in $garbage_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/garbage_phones.txt + +for p in $silence_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/silence_phones.txt + +if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ + steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then + echo "$0: Invalid $dir/{silence,garbage}_phones.txt" + exit 1 +fi + +# The training data may already be segmented, so we first prepare +# a "whole" training data (not segmented) for training the SAD +# system. + +whole_data_dir=${data_dir}_whole +whole_data_id=$(basename $whole_data_dir) + +if [ $stage -le 0 ]; then + utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir +fi + +############################################################################### +# Extract features for the whole data directory. We extract 13-dim MFCCs to +# generate targets using the GMM system, and 40-dim MFCCs to train the NN-based +# SAD. 
+############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} + utils/fix_data_dir.sh $whole_data_dir + + utils/copy_data_dir.sh $whole_data_dir ${whole_data_dir}_hires + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf \ + ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + utils/fix_data_dir.sh ${whole_data_dir}_hires +fi + +############################################################################### +# Prepare SAD targets for recordings +############################################################################### +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 +if [ $stage -le 2 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --remove-mismatch-frames false \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +############################################################################### +# Train a neural network for SAD +############################################################################### +if [ $stage -le 3 ]; then + if [ $nnet_type == "stats" ]; then + # Train a STATS-pooling network for SAD + local/segmentation/tuning/train_stats_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires 
--affix "1a" || exit 1 + + elif [ $nnet_type == "lstm" ]; then + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + fi +fi + +exit 0; diff --git a/egs/chime6/s5_track2/local/wer_output_filter b/egs/chime6/s5_track2/local/wer_output_filter new file mode 120000 index 00000000000..12a6c616d3d --- /dev/null +++ b/egs/chime6/s5_track2/local/wer_output_filter @@ -0,0 +1 @@ +../../s5_track1/local/wer_output_filter \ No newline at end of file diff --git a/egs/chime6/s5_track2/path.sh b/egs/chime6/s5_track2/path.sh new file mode 100644 index 00000000000..c2526194bee --- /dev/null +++ b/egs/chime6/s5_track2/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track2/run.sh b/egs/chime6/s5_track2/run.sh new file mode 100755 index 00000000000..1350b8e14d5 --- /dev/null +++ b/egs/chime6/s5_track2/run.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# +# Chime-6 Track 2 baseline. Based mostly on the Chime-5 recipe, with the exception +# that we are required to perform speech activity detection and speaker +# diarization before ASR, since we do not have access to the oracle SAD and +# diarization labels. +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. 
+nj=50 +decode_nj=20 +stage=0 +nnet_stage=-10 +sad_stage=0 +diarizer_stage=0 +decode_stage=1 +enhancement=beamformit # for a new enhancement method, + # change this variable and decode stage +decode_only=false +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=18 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +sad_train_set=train_worn_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1; + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. 
+########################################################################### + +if [ $stage -le 1 ]; then + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "<unk>" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "<unk>" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + + +######################################################################################### +# In stages 5 and 6, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train.
+######################################################################################### + +if [ $stage -le 5 ]; then + echo "$0: Extracting noise list from training data" + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + echo "$0: Preparing simulated RIRs for data augmentation" + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + utils/combine_data.sh data/${sad_train_set} 
data/train_worn data/train_u400k +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + utils/copy_data_dir.sh data/${train_set} data/${train_set}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${train_set}_nosplit data/${train_set} +fi + +################################################################################## +# Now make MFCC features. We use 13-dim MFCCs to train the GMM-HMM models. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." + mfccdir=mfcc + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc.conf \ + data/${train_set} exp/make_mfcc/${train_set} $mfccdir + steps/compute_cmvn_stats.sh data/${train_set} exp/make_mfcc/${train_set} $mfccdir + utils/fix_data_dir.sh data/${train_set} +fi + +################################################################################### +# Stages 9 to 14 train monophone and triphone models. They will be used for +# generating lattices for training the chain model and for obtaining targets +# for training the SAD system. 
+################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj $nj --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# You can also download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ 
+########################################################################## +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj $nj \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# SAD MODEL TRAINING +# You can also download a pretrained SAD model using: +# wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_sad_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 16 ]; then + local/train_sad.sh --stage $sad_stage --nj $nj \ + --data-dir data/${sad_train_set} --test-sets "${test_sets}" \ + --sat-model-dir exp/tri3_cleaned \ + --model-dir exp/tri2 +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 17 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/${train_set} \ + --model-dir exp/xvector_nnet_1a +fi + +########################################################################## +# DECODING: In track 2, we are given raw utterances without segment +# or speaker information, so we have to decode the whole pipeline, i.e., +# SAD -> Diarization -> ASR. This is done in the local/decode.sh +# script. 
+########################################################################## +if [ $stage -le 18 ]; then + local/decode.sh --stage $decode_stage \ + --enhancement $enhancement \ + --test-sets "$test_sets" +fi + +exit 0; + diff --git a/egs/chime6/s5_track2/sid b/egs/chime6/s5_track2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/chime6/s5_track2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/chime6/s5_track2/steps b/egs/chime6/s5_track2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5_track2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/utils b/egs/chime6/s5_track2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5_track2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index 20bcfd96d96..76025f4a388 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -46,6 +46,7 @@ overlap_duration=2.5 max_remaining_duration=5 # If the last remaining piece when splitting uniformly # is smaller than this duration, then the last piece # is merged with the previous. +remove_mismatch_frames=true # List of weights on labels obtained from alignment, # labels obtained from decoding and default labels in out-of-segment regions @@ -108,7 +109,7 @@ for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \ fi done -utils/validate_data_dir.sh $in_data_dir || exit 1 +utils/validate_data_dir.sh --no-feats $in_data_dir || exit 1 utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1 if ! 
cat $garbage_phones_list $silence_phones_list | \ @@ -159,7 +160,7 @@ whole_data_dir=$dir/$whole_data_id # Obtain supervision-constrained lattices ############################################################################### sup_lats_dir=$dir/`basename ${ali_model_dir}`_sup_lats_${data_id} -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1 fi @@ -170,7 +171,7 @@ fi uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec uniform_seg_data_id=`basename $uniform_seg_data_dir` -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then utils/data/get_segments_for_data.sh ${whole_data_dir} > \ ${whole_data_dir}/segments @@ -193,7 +194,7 @@ model_id=$(basename $model_dir) ############################################################################### if [ -z "$graph_dir" ]; then graph_dir=$dir/$model_id/graph - if [ $stage -le 4 ]; then + if [ $stage -le 5 ]; then if [ ! -f $graph_dir/HCLG.fst ]; then rm -r $dir/lang_test 2>/dev/null || true cp -r $lang_test/ $dir/lang_test @@ -207,7 +208,7 @@ fi ############################################################################### model_id=$(basename $model_dir) decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id} -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then mkdir -p $decode_dir cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id} @@ -228,7 +229,7 @@ ali_model_id=`basename $ali_model_dir` # The target values are obtained by summing up posterior probabilites of # arcs from lattice-arc-post over silence, speech and garbage phones. 
############################################################################### -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \ --silence-phones "$silence_phones_list" \ --garbage-phones "$garbage_phones_list" \ @@ -237,7 +238,7 @@ if [ $stage -le 6 ]; then $dir/${ali_model_id}_${data_id}_sup_targets fi -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \ --silence-phones "$silence_phones_list" \ --garbage-phones "$garbage_phones_list" \ @@ -253,7 +254,7 @@ fi # for the manual segments, these are converted to whole recording-levels # by inserting [ 0 0 0 ] for the out-of-manual segment regions. ############################################################################### -if [ $stage -le 8 ]; then +if [ $stage -le 9 ]; then steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \ $data_dir $whole_data_dir \ $dir/${ali_model_id}_${data_id}_sup_targets \ @@ -268,7 +269,7 @@ fi ############################################################################### # Convert the targets from decoding to whole recording. ############################################################################### -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \ $dir/${uniform_seg_data_id} $whole_data_dir \ $dir/${model_id}_${uniform_seg_data_id}_targets \ @@ -285,7 +286,7 @@ fi # We assume in this setup that this is silence i.e. [ 1 0 0 ]. 
############################################################################### -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then echo " [ 1 0 0 ]" > $dir/default_targets.vec steps/segmentation/get_targets_for_out_of_segments.sh --cmd "$train_cmd" \ --nj $reco_nj --frame-subsampling-factor 3 \ @@ -301,9 +302,9 @@ fi # disagree (more than 0.5 probability on different classes), then those frames # are removed by setting targets to [ 0 0 0 ]. ############################################################################### -if [ $stage -le 11 ]; then +if [ $stage -le 12 ]; then steps/segmentation/merge_targets_dirs.sh --cmd "$train_cmd" --nj $reco_nj \ - --weights $merge_weights --remove-mismatch-frames true \ + --weights $merge_weights --remove-mismatch-frames $remove_mismatch_frames \ $whole_data_dir \ $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \ $dir/${model_id}_${whole_data_id}_targets_sub3 \