From 92ad8baaafe6e7b24cdf86f5fad7967e8e9faab6 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 12 Oct 2016 05:01:53 -0400 Subject: [PATCH 1/9] Augmentation recipe for swbd --- .../chain/multi_condition/run_tdnn_7b.sh | 226 ++++++++++++++++++ .../nnet3/multi_condition/copy_ali_dir.sh | 78 ++++++ .../multi_condition/run_ivector_common.sh | 148 ++++++++++++ 3 files changed, 452 insertions(+) create mode 100755 egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh create mode 100755 egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh create mode 100755 egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh new file mode 100755 index 00000000000..7a8c08970e2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=1 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +iv_dir=exp/nnet3_rvb +num_data_reps=1 + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=2 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <> $rvb_lat_dir/temp/combined_lats.scp + done + sort -u $rvb_lat_dir/temp/combined_lats.scp > $rvb_lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$rvb_lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$rvb_lat_dir/lat.1.gz" || exit 1; + echo "1" > $rvb_lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $lat_dir/$f $rvb_lat_dir/$f + done +fi + + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + # we build the tree using clean features (data/train) rather than + # the augmented features (data/train_rvb) to get better alignments + + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/train_nodup${suffix} $lang exp/tri4_ali_nodup${suffix} $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir $iv_dir/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $iv_dir/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir $rvb_lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $iv_dir/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh b/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh new file mode 100755 index 00000000000..42ea2dc4b9d --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Vijayaditya Peddinti) +# Apache 2.0 + +# This script operates on a directory, such as in exp/tri4a_ali, +# that contains some subset of the following files: +# ali.*.gz +# tree +# cmvn_opts +# splice_opts +# num_jobs +# final.mdl +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance names. + + +# begin configuration section +utt_prefix= +utt_suffix= +cmd=run.pl +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/tri4a_ali exp/tri4a_rev1_ali" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +src_dir=$1 +dest_dir=$2 + +mkdir -p $dest_dir + +if [ ! -f $src_dir/ali.1.gz ]; then + echo "copy_ali_dir.sh: no such files $src_dir/ali.*.gz" + exit 1; +fi + +for f in tree cmvn_opts splice_opts num_jobs final.mdl; do + if [ ! -f $src_dir/$f ]; then + echo "copy_ali_dir.sh: no such file $src_dir/$f this might be serious error." + continue + fi + cp $src_dir/$f $dest_dir/ +done + +nj=$(cat $dest_dir/num_jobs) +mkdir -p $dest_dir/temp +cat << EOF > $dest_dir/temp/copy_ali.sh +set -e; +id=\$1 +echo "$src_dir/ali.\$id.gz" +gunzip -c $src_dir/ali.\$id.gz | \ + copy-int-vector ark:- ark,t:- | \ +python -c " +import sys +for line in sys.stdin: + parts = line.split() + print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) +" | \ + gzip -c >$dest_dir/ali.\$id.gz || exit 1; +set +o pipefail; # unset the pipefail option. +EOF +chmod +x $dest_dir/temp/copy_ali.sh +$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1; + +echo "$0: copied alignments from $src_dir to $dest_dir" diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh new file mode 100755 index 00000000000..126bf17b557 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh @@ -0,0 +1,148 @@ +#!/bin/bash +#set -e +# this script is based on local/nnet3/run_ivector_common.sh +# but it operates on corrupted training/dev/test data sets + +. cmd.sh + +stage=1 +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +num_data_reps=1 +clean_data_dir=train_nodup_sp +iv_dir=exp/nnet3_rvb + +set -e +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +mkdir -p $iv_dir +train_set=${clean_data_dir}_rvb${num_data_reps} + +if [ $stage -le 1 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # corrupt the data to generate reverberated data + python steps/data/reverberate_data_dir.py \ + --prefix "rev" \ + --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/smallroom/rir_list" \ + --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/mediumroom/rir_list" \ + --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/largeroom/rir_list" \ + --rir-set-parameters "0.25, RIRS_NOISES/real_rirs_isotropic_noises/rir_list" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 8000 \ + data/${clean_data_dir} data/${train_set} +fi + + +if [ $stage -le 2 ]; then + mfccdir=mfcc_rvb + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires; + done +fi + + +# ivector extractor training +if [ $stage -le 5 ]; then + # Here we want to build a 200k system, half from the reverberated set and half from the original set + local/nnet3/multi_condition/copy_ali_dir.sh --utt-prefix "rev1_sp1.0-" exp/tri2_ali_100k_nodup exp/tri2_ali_100k_nodup_rvb || exit 1; + local/nnet3/multi_condition/copy_ali_dir.sh --utt-prefix "rev0_sp1.0-" exp/tri2_ali_100k_nodup exp/tri2_ali_100k_nodup_clean || exit 1; + + # want the 100k subset to exactly match train_100k, since we'll use its alignments. + awk -v p='rev1_sp1.0-' '{printf "%s%s\n", p, $1}' data/train_100k_nodup/utt2spk > uttlist + utils/subset_data_dir.sh --utt-list uttlist \ + data/${train_set}_hires data/${train_set}_100k_hires + rm uttlist + + # Mix the 100k original data and the 100k reverberated data + utils/copy_data_dir.sh --spk-prefix "rev0_sp1.0-" --utt-prefix "rev0_sp1.0-" data/train_100k_nodup_hires data/train_100k_nodup_hires_tmp + utils/combine_data.sh data/${train_set}_200k_mix_hires data/train_100k_nodup_hires_tmp data/${train_set}_100k_hires + rm -r data/train_100k_nodup_hires_tmp + + # combine the alignment for mixed data + steps/combine_ali_dirs.sh --num-jobs 30 data/${train_set}_200k_mix_hires exp/tri2_ali_200k_mix exp/tri2_ali_100k_nodup_clean exp/tri2_ali_100k_nodup_rvb || exit 1; + rm -r exp/tri2_ali_100k_nodup_clean exp/tri2_ali_100k_nodup_rvb + + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. 
We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${train_set}_200k_mix_hires \ + data/lang_nosp exp/tri2_ali_200k_mix $iv_dir/tri3b +fi + +if [ $stage -le 6 ]; then + utils/copy_data_dir.sh --spk-prefix "rev0_" --utt-prefix "rev0_" data/${clean_data_dir}_30k_nodup_hires data/${clean_data_dir}_30k_nodup_hires_tmp + # want the reverberated 30k subset to exactly match clean 30k, since we'll use its alignments. + awk -v p='rev1_' '{printf "%s%s\n", p, $1}' data/${clean_data_dir}_30k_nodup_hires/utt2spk > uttlist + utils/subset_data_dir.sh --utt-list uttlist \ + data/${train_set}_hires data/${train_set}_30k_hires + rm uttlist + + # Mix the 30k original data and the 30k reverberated data + utils/combine_data.sh data/${train_set}_60k_mix_hires data/${clean_data_dir}_30k_nodup_hires_tmp data/${train_set}_30k_hires + rm -r data/${clean_data_dir}_30k_nodup_hires_tmp + + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_60k_mix_hires 512 $iv_dir/tri3b $iv_dir/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${train_set}_200k_mix_hires $iv_dir/diag_ubm $iv_dir/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + # handle per-utterance decoding well (iVector starts at zero). 
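
The mixing in the stages above and below leans on the utterance-id scheme of
steps/data/reverberate_data_dir.py: copy number i of utterance u is named
${prefix}${i}_${u}, and the recipe manufactures matching "rev0_" names for the
clean copies with copy_data_dir.sh / copy_ali_dir.sh. A minimal sketch of the
composition (compose_id is a hypothetical helper, and the utterance id is
illustrative, not taken from the patch):

    compose_id() { echo "${1}${2}_${3}"; }  # compose_id <prefix> <copy-index> <utt-id>
    compose_id rev 1 sp1.0-sw02001-A_000098-001156
    # -> rev1_sp1.0-sw02001-A_000098-001156  (reverberated copy made by the script)
    compose_id rev 0 sp1.0-sw02001-A_000098-001156
    # -> rev0_sp1.0-sw02001-A_000098-001156  (clean copy named via copy_data_dir.sh)

This is why copy_ali_dir.sh is invoked above with --utt-prefix "rev1_sp1.0-":
the copied clean alignments then carry exactly the utterance ids of the
reverberated 100k subset.
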
+ + # Mix all the original data and all the reverberated data + utils/copy_data_dir.sh --spk-prefix "rev0_" --utt-prefix "rev0_" data/${clean_data_dir}_hires data/${clean_data_dir}_hires_clean + utils/combine_data.sh data/${train_set}_mix_hires data/${clean_data_dir}_hires_clean data/${train_set}_hires + + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_mix_hires data/${train_set}_mix_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_mix_max2_hires $iv_dir/extractor $iv_dir/ivectors_${train_set}_mix || exit 1; + + for data_set in eval2000; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires $iv_dir/extractor $iv_dir/ivectors_$data_set || exit 1; + done +fi + +exit 0; + From 0bcf41ee1a8a6b9fb4ed9a16c130daf139b66992 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 12 Oct 2016 21:44:22 -0400 Subject: [PATCH 2/9] result added --- egs/swbd/s5c/RESULTS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index 6223c4ca319..471e088ffba 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -185,6 +185,13 @@ exit 0 %WER 10.4 | 1831 21395 | 90.7 6.1 3.2 1.2 10.4 44.6 | exp/chain/tdnn_7d_sp/decode_eval2000_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys %WER 11.6 | 1831 21395 | 89.7 7.0 3.3 1.4 11.6 47.0 | exp/chain/tdnn_7d_sp/decode_eval2000_sw1_tg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys + +# results with chain TDNNs (2 epoch training on data reverberated with room impulse responses) (see local/chain/multi_condition/run_tdnn_7b.sh) +%WER 10.0 | 1831 21395 | 91.0 6.0 3.0 1.1 10.0 43.8 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 20.0 | 2628 21594 | 82.1 11.7 6.2 2.1 20.0 55.6 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 15.0 | 4459 42989 | 86.5 8.8 4.7 1.6 15.0 50.7 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys + + # current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) %WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys %WER 10.5 | 1831 21395 | 90.8 6.4 2.9 1.3 10.5 44.3 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys From 178c9d1aa7d91117afe1faed04cbab80cce92096 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Mon, 24 Oct 2016 22:55:08 -0400 Subject: [PATCH 3/9] remove copy_ali_dir.sh; add --include-original-data to reverberate script; modify swbd-rvb script --- egs/swbd/s5c/RESULTS | 6 +- .../chain/multi_condition/run_tdnn_7b.sh | 41 ++++--- .../nnet3/multi_condition/copy_ali_dir.sh | 78 ------------- .../multi_condition/run_ivector_common.sh | 105 +++++++++--------- egs/wsj/s5/steps/data/reverberate_data_dir.py | 61 ++++++---- 5 files changed, 124 insertions(+), 167 deletions(-) delete mode 100755 egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index 471e088ffba..e5bc3737c66 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -186,10 +186,10 @@ exit 0 %WER 11.6 | 1831 21395 | 89.7 7.0 3.3 1.4 11.6 47.0 | exp/chain/tdnn_7d_sp/decode_eval2000_sw1_tg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys -# results with chain TDNNs (2 epoch training on data reverberated with room 
impulse responses) (see local/chain/multi_condition/run_tdnn_7b.sh) -%WER 10.0 | 1831 21395 | 91.0 6.0 3.0 1.1 10.0 43.8 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys -%WER 20.0 | 2628 21594 | 82.1 11.7 6.2 2.1 20.0 55.6 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +# results with chain TDNNs (2 epoch training on data being speed-perturbed, volume-perturbed and reverberated with room impulse responses) (see local/chain/multi_condition/run_tdnn_7b.sh) %WER 15.0 | 4459 42989 | 86.5 8.8 4.7 1.6 15.0 50.7 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +%WER 10.0 | 1831 21395 | 91.0 6.0 3.0 1.1 10.0 43.8 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 20.0 | 2628 21594 | 82.1 11.7 6.2 2.1 20.0 55.6 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/sddcore_10_0.0/eval2000_hires.ctm.callhm.filt.sys # current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh index 7a8c08970e2..19dd29eae16 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh @@ -4,14 +4,15 @@ set -e # configs for 'chain' affix= -stage=1 -train_stage=-10 +stage=12 +train_stage=$1 get_egs_stage=-10 speed_perturb=true dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. decode_iter= iv_dir=exp/nnet3_rvb num_data_reps=1 +clean_train_set=train_nodup # TDNN options # this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing @@ -57,30 +58,40 @@ fi # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. + +# if we are using the speed-perturbed data we need to generate +# alignments for it. +# Also the data reverberation will be done in this script/ +echo local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ + --clean-data-dir ${clean_train_set} \ + --iv-dir $iv_dir \ + --speed-perturb $speed_perturb \ + --num-data-reps $num_data_reps || exit 1; + + if [ "$speed_perturb" == "true" ]; then suffix=_sp fi -dir=${dir}${affix:+_$affix}${suffix}_rvb${num_data_reps}_mix -train_set=train_nodup${suffix}_rvb${num_data_reps}_mix +clean_train_set=${clean_train_set}${suffix} +dir=${dir}${affix:+_$affix}${suffix}_rvb${num_data_reps} +train_set=${clean_train_set}${suffix}_rvb${num_data_reps} lang=data/lang_chain_2y treedir=exp/chain/tri5_2y_tree${suffix} lat_dir=exp/tri4_lats_nodup${suffix} -rvb_lat_dir=${lat_dir}_rvb${num_data_reps}_mix - - -# if we are using the speed-perturbed data we need to generate -# alignments for it. -echo local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ - --clean-data-dir train_nodup${suffix} \ - --iv-dir $iv_dir \ - --num-data-reps $num_data_reps || exit 1; +rvb_lat_dir=${lat_dir}_rvb${num_data_reps} if [ $stage -le 9 ]; then # Get the alignments as lattices (gives the CTC training more freedom). 
# use the same num-jobs as the alignments - + nj=$(cat exp/tri4_ali_nodup${suffix}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_data_dir} \ + data/lang exp/tri4 $lat_dir + rm $lat_dir/fsts.*.gz # save space + + + # Create the lattices for the reverberated data mkdir -p $rvb_lat_dir/temp/ lattice-copy "ark:gunzip -c $lat_dir/lat.*.gz |" ark,scp:$rvb_lat_dir/temp/lats.ark,$rvb_lat_dir/temp/lats.scp @@ -157,7 +168,6 @@ if [ $stage -le 12 ]; then fi - if [ $stage -le 13 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -191,7 +201,6 @@ if [ $stage -le 13 ]; then --tree-dir $treedir \ --lat-dir $rvb_lat_dir \ --dir $dir || exit 1; - fi if [ $stage -le 13 ]; then diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh b/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh deleted file mode 100755 index 42ea2dc4b9d..00000000000 --- a/egs/swbd/s5c/local/nnet3/multi_condition/copy_ali_dir.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Vijayaditya Peddinti) -# Apache 2.0 - -# This script operates on a directory, such as in exp/tri4a_ali, -# that contains some subset of the following files: -# ali.*.gz -# tree -# cmvn_opts -# splice_opts -# num_jobs -# final.mdl -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance names. - - -# begin configuration section -utt_prefix= -utt_suffix= -cmd=run.pl -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --utt-prefix=1- exp/tri4a_ali exp/tri4a_rev1_ali" - echo "Options" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -src_dir=$1 -dest_dir=$2 - -mkdir -p $dest_dir - -if [ ! -f $src_dir/ali.1.gz ]; then - echo "copy_ali_dir.sh: no such files $src_dir/ali.*.gz" - exit 1; -fi - -for f in tree cmvn_opts splice_opts num_jobs final.mdl; do - if [ ! -f $src_dir/$f ]; then - echo "copy_ali_dir.sh: no such file $src_dir/$f this might be serious error." - continue - fi - cp $src_dir/$f $dest_dir/ -done - -nj=$(cat $dest_dir/num_jobs) -mkdir -p $dest_dir/temp -cat << EOF > $dest_dir/temp/copy_ali.sh -set -e; -id=\$1 -echo "$src_dir/ali.\$id.gz" -gunzip -c $src_dir/ali.\$id.gz | \ - copy-int-vector ark:- ark,t:- | \ -python -c " -import sys -for line in sys.stdin: - parts = line.split() - print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) -" | \ - gzip -c >$dest_dir/ali.\$id.gz || exit 1; -set +o pipefail; # unset the pipefail option. -EOF -chmod +x $dest_dir/temp/copy_ali.sh -$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_ali.JOB.log $dest_dir/temp/copy_ali.sh JOB || exit 1; - -echo "$0: copied alignments from $src_dir to $dest_dir" diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh index 126bf17b557..e99a7168c87 100755 --- a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh @@ -9,8 +9,9 @@ stage=1 foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" num_data_reps=1 -clean_data_dir=train_nodup_sp +clean_data_dir=train_nodup iv_dir=exp/nnet3_rvb +speed_perturb=true set -e . cmd.sh @@ -18,9 +19,45 @@ set -e . 
./utils/parse_options.sh mkdir -p $iv_dir -train_set=${clean_data_dir}_rvb${num_data_reps} -if [ $stage -le 1 ]; then +if [ "$speed_perturb" == "true" ]; then + # perturbed data preparation + if [ $stage -le 1 ] && [ ! -d data/${clean_data_dir}_sp ]; then + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in ${clean_data_dir}; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + + if [ $stage -le 2 ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${clean_data_dir}_sp data/lang_nosp exp/tri4 exp/tri4_ali_nodup_sp || exit 1 + fi + + clean_data_dir=${clean_data_dir}_sp +fi + + +if [ $stage -le 3 ]; then if [ ! -d "RIRS_NOISES" ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip @@ -28,6 +65,7 @@ if [ $stage -le 1 ]; then fi # corrupt the data to generate reverberated data + # this script modifies wav.scp to include the reverberation commands, the real computation will be done at the feature extraction python steps/data/reverberate_data_dir.py \ --prefix "rev" \ --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/smallroom/rir_list" \ @@ -42,18 +80,19 @@ if [ $stage -le 1 ]; then --num-replications $num_data_reps \ --max-noises-per-minute 1 \ --source-sampling-rate 8000 \ - data/${clean_data_dir} data/${train_set} + --include-original-data true \ + data/${clean_data_dir} data/${clean_data_dir}_rvb${num_data_reps} fi -if [ $stage -le 2 ]; then +if [ $stage -le 4 ]; then mfccdir=mfcc_rvb if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then date=$(date +'%m_%d_%H_%M') utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage fi - for dataset in $train_set; do + for dataset in ${clean_data_dir}_rvb${num_data_reps}; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires # do volume-perturbation on the training data prior to extracting hires @@ -70,58 +109,28 @@ fi # ivector extractor training if [ $stage -le 5 ]; then - # Here we want to build a 200k system, half from the reverberated set and half from the original set - local/nnet3/multi_condition/copy_ali_dir.sh --utt-prefix "rev1_sp1.0-" exp/tri2_ali_100k_nodup exp/tri2_ali_100k_nodup_rvb || exit 1; - local/nnet3/multi_condition/copy_ali_dir.sh --utt-prefix "rev0_sp1.0-" exp/tri2_ali_100k_nodup exp/tri2_ali_100k_nodup_clean || exit 1; - - # want the 100k subset to exactly match train_100k, since we'll use its alignments. 
- awk -v p='rev1_sp1.0-' '{printf "%s%s\n", p, $1}' data/train_100k_nodup/utt2spk > uttlist - utils/subset_data_dir.sh --utt-list uttlist \ - data/${train_set}_hires data/${train_set}_100k_hires - rm uttlist - - # Mix the 100k original data and the 100k reverberated data - utils/copy_data_dir.sh --spk-prefix "rev0_sp1.0-" --utt-prefix "rev0_sp1.0-" data/train_100k_nodup_hires data/train_100k_nodup_hires_tmp - utils/combine_data.sh data/${train_set}_200k_mix_hires data/train_100k_nodup_hires_tmp data/${train_set}_100k_hires - rm -r data/train_100k_nodup_hires_tmp - - # combine the alignment for mixed data - steps/combine_ali_dirs.sh --num-jobs 30 data/${train_set}_200k_mix_hires exp/tri2_ali_200k_mix exp/tri2_ali_100k_nodup_clean exp/tri2_ali_100k_nodup_rvb || exit 1; - rm -r exp/tri2_ali_100k_nodup_clean exp/tri2_ali_100k_nodup_rvb - - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - # this decision is based on fisher_english steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/${train_set}_200k_mix_hires \ - data/lang_nosp exp/tri2_ali_200k_mix $iv_dir/tri3b + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri2_ali_100k_nodup $iv_dir/tri3b fi -if [ $stage -le 6 ]; then - utils/copy_data_dir.sh --spk-prefix "rev0_" --utt-prefix "rev0_" data/${clean_data_dir}_30k_nodup_hires data/${clean_data_dir}_30k_nodup_hires_tmp - # want the reverberated 30k subset to exactly match clean 30k, since we'll use its alignments. - awk -v p='rev1_' '{printf "%s%s\n", p, $1}' data/${clean_data_dir}_30k_nodup_hires/utt2spk > uttlist - utils/subset_data_dir.sh --utt-list uttlist \ - data/${train_set}_hires data/${train_set}_30k_hires - rm uttlist - - # Mix the 30k original data and the 30k reverberated data - utils/combine_data.sh data/${train_set}_60k_mix_hires data/${clean_data_dir}_30k_nodup_hires_tmp data/${train_set}_30k_hires - rm -r data/${clean_data_dir}_30k_nodup_hires_tmp +train_set=${clean_data_dir}_rvb${num_data_reps} +if [ $stage -le 6 ]; then # To train a diagonal UBM we don't need very much data, so use the smallest subset. + utils/subset_data_dir.sh data/${train_set}_hires 30000 data/${train_set}_30k_hires steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_60k_mix_hires 512 $iv_dir/tri3b $iv_dir/diag_ubm + data/${train_set}_30k_hires 512 $iv_dir/tri3b $iv_dir/diag_ubm fi if [ $stage -le 7 ]; then # iVector extractors can be sensitive to the amount of data, but this one has a # fairly small dim (defaults to 100) so we don't use all of it, we use just the # 100k subset (just under half the data). + utils/subset_data_dir.sh data/${train_set}_hires 100000 data/${train_set}_100k_hires steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${train_set}_200k_mix_hires $iv_dir/diag_ubm $iv_dir/extractor || exit 1; + data/${train_set}_100k_hires $iv_dir/diag_ubm $iv_dir/extractor || exit 1; fi if [ $stage -le 8 ]; then @@ -129,14 +138,10 @@ if [ $stage -le 8 ]; then # train the system on. # handle per-utterance decoding well (iVector starts at zero). 
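
The --include-original-data true flag added earlier in this patch is what makes
the manual mixing removed below unnecessary: reverberate_data_dir.py now writes
the rev0_ (clean) and rev1_ (reverberated) copies into one output directory
itself, and it only rewrites the wav.scp pipelines, so the actual
wav-reverberate work happens lazily at feature extraction. A quick sanity check
of the combined directory might look like the sketch below (paths assume the
defaults used in this recipe, clean_data_dir=train_nodup_sp and
num_data_reps=1):

    clean=data/train_nodup_sp
    rvb=data/train_nodup_sp_rvb1
    n_clean=$(wc -l < $clean/utt2spk)
    n_rvb=$(wc -l < $rvb/utt2spk)
    echo "clean utts: $n_clean, output utts: $n_rvb (expect $((2 * n_clean)))"
    grep -c '^rev0_' $rvb/utt2spk   # unmodified copies kept by --include-original-data
    grep -c '^rev1_' $rvb/utt2spk   # reverberated copies
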
- # Mix all the original data and all the reverberated data - utils/copy_data_dir.sh --spk-prefix "rev0_" --utt-prefix "rev0_" data/${clean_data_dir}_hires data/${clean_data_dir}_hires_clean - utils/combine_data.sh data/${train_set}_mix_hires data/${clean_data_dir}_hires_clean data/${train_set}_hires - - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_mix_hires data/${train_set}_mix_max2_hires + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_mix_max2_hires $iv_dir/extractor $iv_dir/ivectors_${train_set}_mix || exit 1; + data/${train_set}_max2_hires $iv_dir/extractor $iv_dir/ivectors_${train_set} || exit 1; for data_set in eval2000; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 890213475cd..72679406213 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -69,6 +69,8 @@ def GetArgs(): parser.add_argument('--source-sampling-rate', type=int, default=None, help="Sampling rate of the source data. If a positive integer is specified with this option, " "the RIRs/noises will be resampled to the rate of the source data.") + parser.add_argument("--include-original-data", type=str, help="If true, the output data includes one copy of the original data", + choices=['true', 'false'], default = False) parser.add_argument("input_dir", help="Input data directory") parser.add_argument("output_dir", @@ -85,11 +87,11 @@ def CheckArgs(args): if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - ## Check arguments. 
- - if args.num_replicas > 1 and args.prefix is None: - args.prefix = "rvb" - warnings.warn("--prefix is set to 'rvb' as --num-replications is larger than 1.") + ## Check arguments + if args.prefix is None: + if args.num_replicas > 1 or args.include_original_data: + args.prefix = "rvb" + warnings.warn("--prefix is set to 'rvb' as more than one copy of data is generated") if not args.num_replicas > 0: raise Exception("--num-replications cannot be non-positive") @@ -180,13 +182,18 @@ def WriteDictToFile(dict, file_name): # This function creates the utt2uniq file from the utterance id in utt2spk file -def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix): +def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix): corrupted_utt2uniq = {} # Parse the utt2spk to get the utterance id utt2spk = ParseFileToDict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x)) keys = utt2spk.keys() keys.sort() - for i in range(1, num_replicas+1): + if include_original: + start_index = 0 + else: + start_index = 1 + + for i in range(start_index, num_replicas+1): for utt_id in keys: new_utt_id = GetNewId(utt_id, prefix, i) corrupted_utt2uniq[new_utt_id] = utt_id @@ -314,6 +321,7 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal foreground_snr_array, # the SNR for adding the foreground noises background_snr_array, # the SNR for adding the background noises num_replicas, # Number of replicate to generated for the data + include_original, # include a copy of the original data prefix, # prefix for the id of the corrupted utterances speech_rvb_probability, # Probability of reverberating a speech signal shift_output, # option whether to shift the output waveform @@ -326,7 +334,12 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal corrupted_wav_scp = {} keys = wav_scp.keys() keys.sort() - for i in range(1, num_replicas+1): + if include_original: + start_index = 0 + else: + start_index = 1 + + for i in range(start_index, num_replicas+1): for recording_id in keys: wav_original_pipe = wav_scp[recording_id] # check if it is really a pipe @@ -346,8 +359,9 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal speech_dur, # duration of the recording max_noises_recording # Maximum number of point-source noises that can be added ) - - if reverberate_opts == "": + + # prefix with index 0, e.g. 
rvb0_swb0035, stands for the original data
+        if reverberate_opts == "" or i == 0:
             wav_corrupted_pipe = "{0}".format(wav_original_pipe)
         else:
             wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts)

@@ -359,10 +373,15 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal


 # This function replicate the entries in files like segments, utt2spk, text
-def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0]):
+def AddPrefixToFields(input_file, output_file, num_replicas, include_original, prefix, field = [0]):
     list = map(lambda x: x.strip(), open(input_file))
     f = open(output_file, "w")
-    for i in range(1, num_replicas+1):
+    if include_original:
+        start_index = 0
+    else:
+        start_index = 1
+
+    for i in range(start_index, num_replicas+1):
         for line in list:
             if len(line) > 0 and line[0] != ';':
                 split1 = line.split()
@@ -383,6 +402,7 @@ def CreateReverberatedCopy(input_dir,
                            foreground_snr_string, # the SNR for adding the foreground noises
                            background_snr_string, # the SNR for adding the background noises
                            num_replicas, # Number of replicate to generated for the data
+                           include_original, # include a copy of the original data
                            prefix, # prefix for the id of the corrupted utterances
                            speech_rvb_probability, # Probability of reverberating a speech signal
                            shift_output, # option whether to shift the output waveform
@@ -406,27 +426,26 @@ def CreateReverberatedCopy(input_dir,
     background_snr_array = map(lambda x: float(x), background_snr_string.split(':'))

     GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
-                               foreground_snr_array, background_snr_array, num_replicas, prefix,
+                               foreground_snr_array, background_snr_array, num_replicas, include_original, prefix,
                                speech_rvb_probability, shift_output, isotropic_noise_addition_probability,
                                pointsource_noise_addition_probability, max_noises_per_minute)

-    AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, prefix, field = [0,1])
+    AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1])
     data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
                     .format(output_dir = output_dir))

     if os.path.isfile(input_dir + "/utt2uniq"):
-        AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, prefix, field =[0])
+        AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0])
     else:
         # Create the utt2uniq file
-        CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix)
-
+        CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, include_original, prefix)

     if os.path.isfile(input_dir + "/text"):
-        AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, prefix, field =[0])
+        AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0])
     if os.path.isfile(input_dir + "/segments"):
-        AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, prefix, field = [0,1])
+        AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1])
     if os.path.isfile(input_dir + "/reco2file_and_channel"):
-        AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, prefix, field = [0,1])
+        AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1])
AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1]) data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" .format(output_dir = output_dir)) @@ -597,6 +616,7 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None) pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability) # ensure the point-source noise probabilities sum to 1 + pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0) if len(pointsource_noise_list) > 0: assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0) @@ -629,6 +649,7 @@ def Main(): foreground_snr_string = args.foreground_snr_string, background_snr_string = args.background_snr_string, num_replicas = args.num_replicas, + include_original = args.include_original_data, prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, shift_output = args.shift_output, From dc13729030961636d2016bb639cbf1cad6dfcb3b Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Tue, 25 Oct 2016 22:19:35 -0400 Subject: [PATCH 4/9] Add coments and fix typo --- .../chain/multi_condition/run_tdnn_7b.sh | 29 +++++++++---------- .../multi_condition/run_ivector_common.sh | 28 +++++++----------- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh index 19dd29eae16..5a2ccdc971c 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh @@ -4,14 +4,14 @@ set -e # configs for 'chain' affix= -stage=12 -train_stage=$1 +stage=1 +train_stage=-10 get_egs_stage=-10 speed_perturb=true dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. decode_iter= iv_dir=exp/nnet3_rvb -num_data_reps=1 +num_data_reps=1 # number of reverberated copies of data to generate clean_train_set=train_nodup # TDNN options @@ -59,21 +59,10 @@ fi # run those things. -# if we are using the speed-perturbed data we need to generate -# alignments for it. -# Also the data reverberation will be done in this script/ -echo local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ - --clean-data-dir ${clean_train_set} \ - --iv-dir $iv_dir \ - --speed-perturb $speed_perturb \ - --num-data-reps $num_data_reps || exit 1; - - if [ "$speed_perturb" == "true" ]; then suffix=_sp fi -clean_train_set=${clean_train_set}${suffix} dir=${dir}${affix:+_$affix}${suffix}_rvb${num_data_reps} train_set=${clean_train_set}${suffix}_rvb${num_data_reps} lang=data/lang_chain_2y @@ -82,11 +71,21 @@ lat_dir=exp/tri4_lats_nodup${suffix} rvb_lat_dir=${lat_dir}_rvb${num_data_reps} +# if we are using the speed-perturbed data we need to generate +# alignments for it. +# Also the data reverberation will be done in this script/ +echo local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ + --clean-data-dir ${clean_train_set} \ + --iv-dir $iv_dir \ + --speed-perturb $speed_perturb \ + --num-data-reps $num_data_reps || exit 1; + + if [ $stage -le 9 ]; then # Get the alignments as lattices (gives the CTC training more freedom). 
# use the same num-jobs as the alignments nj=$(cat exp/tri4_ali_nodup${suffix}/num_jobs) || exit 1; - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_data_dir} \ + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_train_set}${suffix} \ data/lang exp/tri4 $lat_dir rm $lat_dir/fsts.*.gz # save space diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh index e99a7168c87..58c4b4d0b64 100755 --- a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh @@ -1,14 +1,12 @@ #!/bin/bash #set -e -# this script is based on local/nnet3/run_ivector_common.sh -# but it operates on corrupted training/dev/test data sets +# This script is based on local/nnet3/run_ivector_common.sh. +# It reverberates the original data with simulated room impulse responses . cmd.sh stage=1 -foreground_snrs="20:10:15:5:0" -background_snrs="20:10:15:5:0" -num_data_reps=1 +num_data_reps=1 # number of reverberated copies of data to generate clean_data_dir=train_nodup iv_dir=exp/nnet3_rvb speed_perturb=true @@ -58,27 +56,21 @@ fi if [ $stage -le 3 ]; then - if [ ! -d "RIRS_NOISES" ]; then - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip + if [ ! -d "simulated_rirs_8k" ]; then + # Download the simulated RIR package with 8k sampling rate + wget --no-check-certificate http://www.openslr.org/resources/26/sim_rir_8k.zip + unzip sim_rir_8k.zip fi # corrupt the data to generate reverberated data # this script modifies wav.scp to include the reverberation commands, the real computation will be done at the feature extraction python steps/data/reverberate_data_dir.py \ --prefix "rev" \ - --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/smallroom/rir_list" \ - --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/mediumroom/rir_list" \ - --rir-set-parameters "0.25, RIRS_NOISES/simulated_rirs/largeroom/rir_list" \ - --rir-set-parameters "0.25, RIRS_NOISES/real_rirs_isotropic_noises/rir_list" \ - --foreground-snrs $foreground_snrs \ - --background-snrs $background_snrs \ + --rir-set-parameters "0.3, simulated_rirs_8k/smallroom/rir_list" \ + --rir-set-parameters "0.3, simulated_rirs_8k/mediumroom/rir_list" \ + --rir-set-parameters "0.3, simulated_rirs_8k/largeroom/rir_list" \ --speech-rvb-probability 1 \ - --pointsource-noise-addition-probability 1 \ - --isotropic-noise-addition-probability 1 \ --num-replications $num_data_reps \ - --max-noises-per-minute 1 \ --source-sampling-rate 8000 \ --include-original-data true \ data/${clean_data_dir} data/${clean_data_dir}_rvb${num_data_reps} From 82cd6f70e68d20028c64e6d68376ecf8a50851e9 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Thu, 27 Oct 2016 01:07:49 -0400 Subject: [PATCH 5/9] fix --include-original-data option in reverberate_data_dir.py --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 72679406213..017aedb05a3 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -70,7 +70,7 @@ def GetArgs(): help="Sampling rate of the source data. 
If a positive integer is specified with this option, " "the RIRs/noises will be resampled to the rate of the source data.") parser.add_argument("--include-original-data", type=str, help="If true, the output data includes one copy of the original data", - choices=['true', 'false'], default = False) + choices=['true', 'false'], default = "false") parser.add_argument("input_dir", help="Input data directory") parser.add_argument("output_dir", @@ -89,7 +89,7 @@ def CheckArgs(args): ## Check arguments if args.prefix is None: - if args.num_replicas > 1 or args.include_original_data: + if args.num_replicas > 1 or args.include_original_data == "true": args.prefix = "rvb" warnings.warn("--prefix is set to 'rvb' as more than one copy of data is generated") @@ -641,6 +641,11 @@ def Main(): print("Number of isotropic noises is {0}".format(sum(len(iso_noise_dict[key]) for key in iso_noise_dict.keys()))) room_dict = MakeRoomDict(rir_list) + if args.include_original_data == "true": + include_original = True + else: + include_original = False + CreateReverberatedCopy(input_dir = args.input_dir, output_dir = args.output_dir, room_dict = room_dict, @@ -649,7 +654,7 @@ def Main(): foreground_snr_string = args.foreground_snr_string, background_snr_string = args.background_snr_string, num_replicas = args.num_replicas, - include_original = args.include_original_data, + include_original = include_original, prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, shift_output = args.shift_output, From a4ee796b480abe2c924a6dd7d833375cf995f38d Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Mon, 31 Oct 2016 23:43:42 -0400 Subject: [PATCH 6/9] adding run_tdnn_7g.sh which is the current best chain result --- egs/swbd/s5c/RESULTS | 31 +-- egs/swbd/s5c/local/chain/run_tdnn.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7f.sh | 210 ++++++++++++++++++ .../run_tdnn_7b.sh => tuning/run_tdnn_7g.sh} | 123 +++++----- .../multi_condition/run_ivector_common.sh | 29 +-- 5 files changed, 312 insertions(+), 83 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh rename egs/swbd/s5c/local/chain/{multi_condition/run_tdnn_7b.sh => tuning/run_tdnn_7g.sh} (65%) diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index e5bc3737c66..f103200f966 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -152,13 +152,23 @@ exit 0 %WER 19.4 | 2628 21594 | 82.7 12.0 5.3 2.1 19.4 54.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 20.8 | 2628 21594 | 81.3 13.1 5.6 2.2 20.8 56.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -# bidirectional LSTM with the same configuration as the above experiment, plus self-repair of all nonliearities and clipgradient activated -%WER 10.4 | 1831 21395 | 90.5 6.2 3.3 0.9 10.4 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 11.3 | 1831 21395 | 89.8 6.8 3.3 1.1 11.3 46.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# bidirectional LSTM with the same configuration as the above experiment, with self-repair of all nonliearities and clipgradient, and max-change-per-component activated +%WER 14.9 | 4459 42989 | 86.7 9.0 4.3 1.6 14.9 50.5 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.9 | 4459 42989 | 85.7 9.8 4.5 1.7 15.9 52.3 | 
exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 10.2 | 1831 21395 | 90.8 6.1 3.2 1.0 10.2 44.4 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.2 | 1831 21395 | 89.9 6.8 3.3 1.1 11.2 46.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 19.4 | 2628 21594 | 82.7 11.8 5.4 2.2 19.4 54.5 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 20.6 | 2628 21594 | 81.5 12.8 5.7 2.2 20.6 56.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +( +# bidirectional LSTM with the same configuration as the above experiment, with self-repair of all nonliearities and clipgradient activated %WER 15.0 | 4459 42989 | 86.5 9.1 4.5 1.5 15.0 50.4 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys %WER 16.0 | 4459 42989 | 85.6 9.9 4.5 1.6 16.0 52.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 10.4 | 1831 21395 | 90.5 6.2 3.3 0.9 10.4 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 89.8 6.8 3.3 1.1 11.3 46.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 19.6 | 2628 21594 | 82.5 12.1 5.5 2.1 19.6 54.8 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 20.7 | 2628 21594 | 81.4 12.9 5.7 2.2 20.7 56.8 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) # results with nnet3 tdnn: local/nnet3/run_tdnn.sh (11.10.2015) (2 epoch training on speed-perturbed and volume perturbed data) %WER 12.1 | 1831 21395 | 89.1 7.1 3.8 1.3 12.1 48.1 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys @@ -181,16 +191,11 @@ exit 0 %WER 24.3 | 2628 21594 | 78.6 15.0 6.4 2.9 24.3 60.0 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -# current best 'chain' models with TDNNs (see local/chain/run_tdnn_7d.sh) -%WER 10.4 | 1831 21395 | 90.7 6.1 3.2 1.2 10.4 44.6 | exp/chain/tdnn_7d_sp/decode_eval2000_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys -%WER 11.6 | 1831 21395 | 89.7 7.0 3.3 1.4 11.6 47.0 | exp/chain/tdnn_7d_sp/decode_eval2000_sw1_tg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys - - -# results with chain TDNNs (2 epoch training on data being speed-perturbed, volume-perturbed and reverberated with room impulse responses) (see local/chain/multi_condition/run_tdnn_7b.sh) -%WER 15.0 | 4459 42989 | 86.5 8.8 4.7 1.6 15.0 50.7 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys -%WER 10.0 | 1831 21395 | 91.0 6.0 3.0 1.1 10.0 43.8 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys -%WER 20.0 | 2628 21594 | 82.1 11.7 6.2 2.1 20.0 55.6 | exp/chain/tdnn_7b_sp_rvb1_mix/decode_eval2000_sw1_fsh_fg/sddcore_10_0.0/eval2000_hires.ctm.callhm.filt.sys - +# current best 'chain' models with TDNNs (see local/chain/run_tdnn_7g.sh) +# (2 epoch training on data being speed-perturbed, volume-perturbed and reverberated with room impulse responses) +%WER 14.6 | 4459 42989 | 87.1 8.7 4.2 1.7 
14.6 50.7 | exp/chain/tdnn_7g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 9.8 | 1831 21395 | 91.2 5.7 3.1 1.1 9.8 43.4 | exp/chain/tdnn_7g_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 19.3 | 2628 21594 | 83.0 11.5 5.5 2.3 19.3 55.8 | exp/chain/tdnn_7g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys # current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) %WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index fd753c6faa5..4b80e886c66 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7e.sh \ No newline at end of file +tuning/run_tdnn_7g.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh new file mode 100755 index 00000000000..5c47da1024f --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh @@ -0,0 +1,210 @@ +#!/bin/bash + +# 7e is as 7f, but adding the max-change-per-component to the neural net training +# which affects results slightly +# local/chain/compare_wer.sh 7e 7f +# System 7e 7f +# WER on train_dev(tg) 14.41 14.46 +# WER on train_dev(fg) 13.39 13.23 +# WER on eval2000(tg) 16.9 17.0 +# WER on eval2000(fg) 15.3 15.4 +# Final train prob -0.0853629 -0.0882071 +# Final valid prob -0.110972 -0.107545 +# Final train prob (xent) -1.25237 -1.26246 +# Final valid prob (xent) -1.36715 -1.35525 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7f # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + repair_opts=${self_repair_scale:+" --self-repair-scale-nonlinearity $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 15 ]; then
+  iter_opts=
+  if [ ! -z $decode_iter ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
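Review note: the decoding stage uses the "( ... ) & ... wait" idiom, so the test sets decode in parallel and the script only blocks at the end; an exit 1 inside a backgrounded subshell does not abort the outer script. A minimal sketch of the pattern:

  for s in train_dev eval2000; do
    ( echo "decoding $s ..."; sleep 1; echo "$s done" ) &
  done
  wait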
diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
similarity index 65%
rename from egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh
rename to egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
index 5a2ccdc971c..2650e06fe5d 100755
--- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7b.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
@@ -1,5 +1,22 @@
 #!/bin/bash
 
+# 7g is as 7f, but reverberating the training data with room impulse responses
+# which leads to better results
+# This script assumes a mixing of the original training data with its reverberated copy
+# and results in a 2-fold training set. Thus the number of epochs is halved to
+# keep the same training time.
+# local/chain/compare_wer.sh 7f 7g
+# System                      7f          7g
+# WER on train_dev(tg)        14.46       14.27
+# WER on train_dev(fg)        13.23       13.16
+# WER on eval2000(tg)         17.0        16.3
+# WER on eval2000(fg)         15.4        14.6
+# Final train prob            -0.0882071  -0.123325
+# Final valid prob            -0.107545   -0.131798
+# Final train prob (xent)     -1.26246    -1.6196
+# Final valid prob (xent)     -1.35525    -1.60244
+
+
 set -e
 
 # configs for 'chain'
@@ -8,18 +25,16 @@ stage=1
 train_stage=-10
 get_egs_stage=-10
 speed_perturb=true
-dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true.
+dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true.
 decode_iter=
-iv_dir=exp/nnet3_rvb
+ivector_dir=exp/nnet3_rvb
 num_data_reps=1 # number of reverberated copies of data to generate
-clean_train_set=train_nodup
+input_train_set=train_nodup
+
 # TDNN options
 # this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing
 # smoothing options
-pool_window=
-pool_type='none'
-pool_lpfilter_width=
 self_repair_scale=0.00001
 # training options
 num_epochs=2
@@ -33,16 +48,14 @@ num_jobs_final=16
 minibatch_size=128
 relu_dim=625
 frames_per_eg=150
-remove_egs=true
+remove_egs=false
 common_egs_dir=
 xent_regularize=0.1
-
-
 # End configuration section.
 echo "$0 $@" # Print the command line for logging
 
-. cmd.sh
+. ./cmd.sh
 . ./path.sh
 . ./utils/parse_options.sh
 
@@ -58,25 +71,27 @@ fi
 # nnet3 setup, and you can skip them by setting "--stage 8" if you have already
 # run those things.
-
+suffix=
 if [ "$speed_perturb" == "true" ]; then
   suffix=_sp
 fi
 
-dir=${dir}${affix:+_$affix}${suffix}_rvb${num_data_reps}
-train_set=${clean_train_set}${suffix}_rvb${num_data_reps}
+dir=${dir}${affix:+_$affix}$suffix
+clean_train_set=${input_train_set}${suffix}
+train_set=${clean_train_set}_rvb${num_data_reps}
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_7d_tree$suffix
 lang=data/lang_chain_2y
-treedir=exp/chain/tri5_2y_tree${suffix}
-lat_dir=exp/tri4_lats_nodup${suffix}
-rvb_lat_dir=${lat_dir}_rvb${num_data_reps}
+clean_lat_dir=exp/tri4_lats_nodup${suffix}
+lat_dir=${clean_lat_dir}_rvb${num_data_reps}
 
 # if we are using the speed-perturbed data we need to generate
 # alignments for it.
-# Also the data reverberation will be done in this script/
-echo local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \
-  --clean-data-dir ${clean_train_set} \
-  --iv-dir $iv_dir \
+# The data reverberation will be done in this script.
+local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \
+  --input-data-dir ${input_train_set} \
+  --ivector-dir $ivector_dir \
   --speed-perturb $speed_perturb \
   --num-data-reps $num_data_reps || exit 1;
 
@@ -85,30 +100,30 @@ if [ $stage -le 9 ]; then
   # Get the alignments as lattices (gives the CTC training more freedom).
   # use the same num-jobs as the alignments
   nj=$(cat exp/tri4_ali_nodup${suffix}/num_jobs) || exit 1;
-  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_train_set}${suffix} \
-    data/lang exp/tri4 $lat_dir
-  rm $lat_dir/fsts.*.gz # save space
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_train_set} \
+    data/lang exp/tri4 $clean_lat_dir
+  rm $clean_lat_dir/fsts.*.gz # save space
 
   # Create the lattices for the reverberated data
-  mkdir -p $rvb_lat_dir/temp/
-  lattice-copy "ark:gunzip -c $lat_dir/lat.*.gz |" ark,scp:$rvb_lat_dir/temp/lats.ark,$rvb_lat_dir/temp/lats.scp
+  mkdir -p $lat_dir/temp/
+  lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp
 
   # copy the lattices for the reverberated data
-  rm -f $rvb_lat_dir/temp/combined_lats.scp
-  touch $rvb_lat_dir/temp/combined_lats.scp
+  rm -f $lat_dir/temp/combined_lats.scp
+  touch $lat_dir/temp/combined_lats.scp
   # Here prefix "rev0_" represents the clean set, "rev1_" represents the reverberated set
   for i in `seq 0 $num_data_reps`; do
-    cat $rvb_lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $rvb_lat_dir/temp/combined_lats.scp
+    cat $lat_dir/temp/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp
   done
-  sort -u $rvb_lat_dir/temp/combined_lats.scp > $rvb_lat_dir/temp/combined_lats_sorted.scp
+  sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp
 
-  lattice-copy scp:$rvb_lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$rvb_lat_dir/lat.1.gz" || exit 1;
-  echo "1" > $rvb_lat_dir/num_jobs
+  lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1;
+  echo "1" > $lat_dir/num_jobs
 
   # copy other files from original lattice dir
   for f in cmvn_opts final.mdl splice_opts tree; do
-    cp $lat_dir/$f $rvb_lat_dir/$f
+    cp $clean_lat_dir/$f $lat_dir/$f
   done
 fi
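Review note: stage 9 above never realigns the reverberated audio; it reuses the clean-data lattices for every copy by rewriting the scp keys, which works because reverberate_data_dir.py keeps utterance ids and only adds a rev<N>_ prefix. A minimal illustration of the key rewriting (the utterance id and archive offset are made up):

  for i in $(seq 0 1); do
    echo "sw02001-A_000098-001156 lats.ark:12345" | sed -e "s/^/rev${i}_/"
  done
  # -> rev0_sw02001-A_000098-001156 lats.ark:12345   (original data)
  # -> rev1_sw02001-A_000098-001156 lats.ark:12345   (reverberated copy, same lattice)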
@@ -127,13 +142,12 @@ if [ $stage -le 10 ]; then
 fi
 
 if [ $stage -le 11 ]; then
-  # Build a tree using our new topology.
-  # we build the tree using clean features (data/train) rather than
-  # the augmented features (data/train_rvb) to get better alignments
-
+  # Build a tree using our new topology. This is the critically different
+  # step compared with other recipes.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --leftmost-questions-truncate $leftmost_questions_truncate \
-      --cmd "$train_cmd" 9000 data/train_nodup${suffix} $lang exp/tri4_ali_nodup${suffix} $treedir
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/${clean_train_set} $lang $ali_dir $treedir
 fi
 
 if [ $stage -le 12 ]; then
@@ -145,19 +159,15 @@ if [ $stage -le 12 ]; then
   fi
 
   # create the config files for nnet initialization
-  pool_opts=
-  pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "}
-  pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "}
-  pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "}
-  repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "}
+  repair_opts=${self_repair_scale:+" --self-repair-scale-nonlinearity $self_repair_scale "}
 
-  steps/nnet3/tdnn/make_configs.py $pool_opts \
+  steps/nnet3/tdnn/make_configs.py \
     $repair_opts \
     --feat-dir data/${train_set}_hires \
-    --ivector-dir $iv_dir/ivectors_${train_set} \
+    --ivector-dir $ivector_dir/ivectors_${train_set} \
     --tree-dir $treedir \
     $dim_opts \
-    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \
+    --splice-indexes "-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0" \
     --use-presoftmax-prior-scale false \
     --xent-regularize $xent_regularize \
     --xent-separate-forward-affine true \
@@ -167,23 +177,23 @@ if [ $stage -le 12 ]; then
 fi
 
 
+
 if [ $stage -le 13 ]; then
   if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
     utils/create_split_dir.pl \
       /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
   fi
 
-  touch $dir/egs/.nodelete # keep egs around when that run dies.
-
-  steps/nnet3/chain/train.py --stage $train_stage \
+  steps/nnet3/chain/train.py --stage $train_stage \
     --cmd "$decode_cmd" \
-    --feat.online-ivector-dir $iv_dir/ivectors_${train_set} \
+    --feat.online-ivector-dir $ivector_dir/ivectors_${train_set} \
     --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
     --chain.xent-regularize $xent_regularize \
     --chain.leaky-hmm-coefficient 0.1 \
     --chain.l2-regularize 0.00005 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
     --egs.stage $get_egs_stage \
     --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width $frames_per_eg \
@@ -198,29 +208,30 @@ if [ $stage -le 13 ]; then
     --cleanup.remove-egs $remove_egs \
     --feat-dir data/${train_set}_hires \
     --tree-dir $treedir \
-    --lat-dir $rvb_lat_dir \
+    --lat-dir $lat_dir \
     --dir $dir || exit 1;
+
 fi
 
-if [ $stage -le 13 ]; then
+if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
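Review note: as in the other chain recipes, the graph is built with --self-loop-scale 1.0 and decoding runs with --acwt 1.0 --post-decode-acwt 10.0, which rescales the output lattices so the usual integer language-model weights still apply; that is why the RESULTS entries earlier in this series sit in score_<LMWT>_<penalty> directories such as score_10_0.0.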
 decode_suff=sw1_tg
 graph_dir=$dir/graph_sw1_tg
-if [ $stage -le 14 ]; then
+if [ $stage -le 15 ]; then
   iter_opts=
   if [ ! -z $decode_iter ]; then
     iter_opts=" --iter $decode_iter "
   fi
-  for decode_set in eval2000; do
+  for decode_set in train_dev eval2000; do
       (
       steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
-          --nj 30 --cmd "$decode_cmd" $iter_opts \
-          --online-ivector-dir $iv_dir/ivectors_${decode_set} \
+          --nj 50 --cmd "$decode_cmd" $iter_opts \
+          --online-ivector-dir $ivector_dir/ivectors_${decode_set} \
           $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
       if $has_fisher; then
           steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
index 58c4b4d0b64..6543b2b0366 100755
--- a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
+++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
@@ -7,8 +7,8 @@
 stage=1
 num_data_reps=1  # number of reverberated copies of data to generate
-clean_data_dir=train_nodup
-iv_dir=exp/nnet3_rvb
+input_data_dir=train_nodup
+ivector_dir=exp/nnet3_rvb
 speed_perturb=true
 
 set -e
@@ -16,15 +16,15 @@ set -e
 . ./path.sh
 . ./utils/parse_options.sh
 
-mkdir -p $iv_dir
+mkdir -p $ivector_dir
 
 if [ "$speed_perturb" == "true" ]; then
   # perturbed data preparation
-  if [ $stage -le 1 ] && [ ! -d data/${clean_data_dir}_sp ]; then
+  if [ $stage -le 1 ] && [ ! -d data/${input_data_dir}_sp ]; then
     #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment
     # _sp stands for speed-perturbed
-    for datadir in ${clean_data_dir}; do
+    for datadir in ${input_data_dir}; do
       utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
       utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
       utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
@@ -48,10 +48,12 @@ if [ "$speed_perturb" == "true" ]; then
   if [ $stage -le 2 ]; then
     #obtain the alignment of the perturbed data
     steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
-      data/${clean_data_dir}_sp data/lang_nosp exp/tri4 exp/tri4_ali_nodup_sp || exit 1
+      data/${input_data_dir}_sp data/lang_nosp exp/tri4 exp/tri4_ali_nodup_sp || exit 1
   fi
 
-  clean_data_dir=${clean_data_dir}_sp
+  clean_data_dir=${input_data_dir}_sp
+else
+  clean_data_dir=${input_data_dir}
 fi
 
@@ -64,6 +66,7 @@ if [ $stage -le 3 ]; then
 
   # corrupt the data to generate reverberated data
   # this script modifies wav.scp to include the reverberation commands, the real computation will be done at the feature extraction
+  # if --include-original-data is true, the original data will be mixed with its reverberated copies
   python steps/data/reverberate_data_dir.py \
     --prefix "rev" \
     --rir-set-parameters "0.3, simulated_rirs_8k/smallroom/rir_list" \
@@ -104,7 +107,7 @@ if [ $stage -le 5 ]; then
   steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
     --splice-opts "--left-context=3 --right-context=3" \
     5500 90000 data/train_100k_nodup_hires \
-    data/lang_nosp exp/tri2_ali_100k_nodup $iv_dir/tri3b
+    data/lang_nosp exp/tri2_ali_100k_nodup $ivector_dir/tri3b
 fi
 
 train_set=${clean_data_dir}_rvb${num_data_reps}
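Review note: with the defaults here (speed_perturb=true, num_data_reps=1) the directory names resolve as below, which is where the _sp_rvb1 suffixes elsewhere in this series come from; a sketch:

  input_data_dir=train_nodup
  clean_data_dir=${input_data_dir}_sp        # -> train_nodup_sp
  train_set=${clean_data_dir}_rvb1           # -> train_nodup_sp_rvb1 (clean + 1 reverberated copy)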
@@ -113,7 +116,7 @@ if [ $stage -le 6 ]; then
   # To train a diagonal UBM we don't need very much data, so use the smallest subset.
   utils/subset_data_dir.sh data/${train_set}_hires 30000 data/${train_set}_30k_hires
   steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
-    data/${train_set}_30k_hires 512 $iv_dir/tri3b $iv_dir/diag_ubm
+    data/${train_set}_30k_hires 512 $ivector_dir/tri3b $ivector_dir/diag_ubm
 fi
 
 if [ $stage -le 7 ]; then
@@ -122,7 +125,7 @@ if [ $stage -le 7 ]; then
   # 100k subset (just under half the data).
   utils/subset_data_dir.sh data/${train_set}_hires 100000 data/${train_set}_100k_hires
   steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
-    data/${train_set}_100k_hires $iv_dir/diag_ubm $iv_dir/extractor || exit 1;
+    data/${train_set}_100k_hires $ivector_dir/diag_ubm $ivector_dir/extractor || exit 1;
 fi
 
 if [ $stage -le 8 ]; then
@@ -133,11 +136,11 @@ if [ $stage -le 8 ]; then
   steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
 
   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
-    data/${train_set}_max2_hires $iv_dir/extractor $iv_dir/ivectors_${train_set} || exit 1;
+    data/${train_set}_max2_hires $ivector_dir/extractor $ivector_dir/ivectors_${train_set} || exit 1;
 
-  for data_set in eval2000; do
+  for data_set in train_dev eval2000; do
     steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
-      data/${data_set}_hires $iv_dir/extractor $iv_dir/ivectors_$data_set || exit 1;
+      data/${data_set}_hires $ivector_dir/extractor $ivector_dir/ivectors_$data_set || exit 1;
   done
 fi

From 67673fa40f06364aea3f33e44b3f979e1193185c Mon Sep 17 00:00:00 2001
From: Tom Ko
Date: Fri, 4 Nov 2016 12:19:08 -0400
Subject: [PATCH 7/9] adding more comments to the script

---
 .../s5c/local/chain/tuning/run_tdnn_7f.sh     |  7 ++--
 .../s5c/local/chain/tuning/run_tdnn_7g.sh     | 14 ++++---
 .../multi_condition/run_ivector_common.sh     | 40 ++++++++-----------
 egs/wsj/s5/steps/data/reverberate_data_dir.py |  2 +-
 4 files changed, 30 insertions(+), 33 deletions(-)
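Review note on this commit: besides rewording comments (CTC -> LF-MMI), it hoists the splice string into a splice_indexes variable in 7f/7g. Since the variable is defined before utils/parse_options.sh is sourced, it becomes overridable from the command line; a hypothetical invocation:

  local/chain/tuning/run_tdnn_7f.sh --splice-indexes "-1,0,1 -2,0,2 -3,0,3 0"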
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
index 5c47da1024f..256373fc698 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# 7e is as 7f, but adding the max-change-per-component to the neural net training
+# 7f is as 7e, but adding the max-change-per-component to the neural net training
 # which affects results slightly
 # local/chain/compare_wer.sh 7e 7f
 # System                      7e          7f
@@ -27,6 +27,7 @@ decode_iter=
 
 # TDNN options
 # this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing
+splice_indexes="-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0"
 # smoothing options
 self_repair_scale=0.00001
 # training options
@@ -84,7 +85,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \
 
 if [ $stage -le 9 ]; then
-  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Get the alignments as lattices (gives the LF-MMI training more freedom).
   # use the same num-jobs as the alignments
   nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
   steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
@@ -132,7 +133,7 @@ if [ $stage -le 12 ]; then
     --ivector-dir exp/nnet3/ivectors_${train_set} \
     --tree-dir $treedir \
     $dim_opts \
-    --splice-indexes "-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0" \
+    --splice-indexes "$splice_indexes" \
     --use-presoftmax-prior-scale false \
     --xent-regularize $xent_regularize \
     --xent-separate-forward-affine true \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
index 2650e06fe5d..cf1343e5041 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
@@ -4,7 +4,8 @@
 # which leads to better results
 # This script assumes a mixing of the original training data with its reverberated copy
 # and results in a 2-fold training set. Thus the number of epochs is halved to
-# keep the same training time.
+# keep the same training time. The model converges after 2 epochs of training;
+# the WER doesn't change much with more epochs of training.
 # local/chain/compare_wer.sh 7f 7g
 # System                      7f          7g
@@ -34,6 +35,7 @@ input_train_set=train_nodup
 
 # TDNN options
 # this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing
+splice_indexes="-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0"
 # smoothing options
 self_repair_scale=0.00001
 # training options
@@ -86,8 +88,6 @@ clean_lat_dir=exp/tri4_lats_nodup${suffix}
 lat_dir=${clean_lat_dir}_rvb${num_data_reps}
 
-# if we are using the speed-perturbed data we need to generate
-# alignments for it.
 # The data reverberation will be done in this script.
 local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \
   --input-data-dir ${input_train_set} \
@@ -97,7 +97,7 @@ local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \
 
 if [ $stage -le 9 ]; then
-  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Get the alignments as lattices (gives the LF-MMI training more freedom).
   # use the same num-jobs as the alignments
   nj=$(cat exp/tri4_ali_nodup${suffix}/num_jobs) || exit 1;
   steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/${clean_train_set} \
@@ -106,6 +106,7 @@ if [ $stage -le 9 ]; then
 
   # Create the lattices for the reverberated data
+  # We use the lattices/alignments from the clean data for the reverberated data.
   mkdir -p $lat_dir/temp/
   lattice-copy "ark:gunzip -c $clean_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp
 
@@ -144,6 +145,7 @@ fi
 if [ $stage -le 11 ]; then
   # Build a tree using our new topology. This is the critically different
   # step compared with other recipes.
+  # we build the tree using the clean alignments, as we empirically found that this was better.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --leftmost-questions-truncate $leftmost_questions_truncate \
       --context-opts "--context-width=2 --central-position=1" \
@@ -167,7 +169,7 @@ if [ $stage -le 12 ]; then
     --ivector-dir $ivector_dir/ivectors_${train_set} \
     --tree-dir $treedir \
     $dim_opts \
-    --splice-indexes "-1,0,1 -1,0,1 -1,0,1 -3,0,3 -3,0,3 -6,0,6 0" \
+    --splice-indexes "$splice_indexes" \
     --use-presoftmax-prior-scale false \
     --xent-regularize $xent_regularize \
     --xent-separate-forward-affine true \
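Review note: the epoch accounting in the new comment works out as follows: with num_data_reps=1 the mixed set is twice the clean set, so 2 epochs over 2x the data sees roughly the same number of frames as the 4 epochs that tuning/run_tdnn_7f.sh (num_epochs=4) spends on the clean set alone.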
@@ -181,7 +183,7 @@ fi
 
 if [ $stage -le 13 ]; then
   if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
     utils/create_split_dir.pl \
-      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-reverb-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
   fi
 
   steps/nnet3/chain/train.py --stage $train_stage \
diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
index 6543b2b0366..5f67e40d0f1 100755
--- a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
+++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
@@ -3,45 +3,38 @@
 # This script is based on local/nnet3/run_ivector_common.sh.
 # It reverberates the original data with simulated room impulse responses
 
-. cmd.sh
+. ./cmd.sh
 
-stage=1
+stage=3
 num_data_reps=1  # number of reverberated copies of data to generate
+                 # These will be combined with the original data.
 input_data_dir=train_nodup
 ivector_dir=exp/nnet3_rvb
 speed_perturb=true
 
 set -e
-. cmd.sh
+. ./cmd.sh
 . ./path.sh
 . ./utils/parse_options.sh
 
 mkdir -p $ivector_dir
 
+# Here we recommend speed perturbation as the gains are significant.
+# The gain from speed perturbation is additive with the gain from data reverberation.
 if [ "$speed_perturb" == "true" ]; then
   # perturbed data preparation
-  if [ $stage -le 1 ] && [ ! -d data/${input_data_dir}_sp ]; then
-    #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment
+  if [ $stage -le 1 ] && [ ! -f data/${input_data_dir}_sp/feats.scp ]; then
+    # Although the nnet will be trained by high resolution data, we still have to prepare normal-resolution MFCC
+    # for purposes of getting alignments and/or lattices on the speed-perturbed data.
     # _sp stands for speed-perturbed
-    for datadir in ${input_data_dir}; do
-      utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
-      utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
-      utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
-      utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
-      rm -r data/temp1 data/temp2
-
-      mfccdir=mfcc_perturbed
-      steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
-        data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
-      steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
-      utils/fix_data_dir.sh data/${datadir}_tmp
-
-      utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
-      utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
-      utils/fix_data_dir.sh data/${datadir}_sp
-      rm -r data/temp0 data/${datadir}_tmp
-    done
+    echo "$0: preparing directory for speed-perturbed data"
+    utils/data/perturb_data_dir_speed_3way.sh data/${input_data_dir} data/${input_data_dir}_sp
+    mfccdir=mfcc_perturbed
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
+      data/${input_data_dir}_sp exp/make_mfcc/${input_data_dir}_sp $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${input_data_dir}_sp exp/make_mfcc/${input_data_dir}_sp $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/${input_data_dir}_sp
   fi
 
@@ -66,6 +59,7 @@ if [ $stage -le 3 ]; then
 
   # corrupt the data to generate reverberated data
   # this script modifies wav.scp to include the reverberation commands, the real computation will be done at the feature extraction
+  # The script will automatically normalize the probability mass of the rir sets, so the user just needs to input the ratio of the sets
   # if --include-original-data is true, the original data will be mixed with its reverberated copies
   python steps/data/reverberate_data_dir.py \
     --prefix "rev" \
diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py
index 017aedb05a3..0083efa4939 100755
--- a/egs/wsj/s5/steps/data/reverberate_data_dir.py
+++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py
@@ -360,7 +360,7 @@ def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kal
                                max_noises_recording  # Maximum number of point-source noises that can be added
                                ):
-        # prefix with index 0, e.g. rvb0_swb0035, stangs for the original data
+        # the prefix with index 0 is reserved for the original data, e.g. rvb0_swb0035 corresponds to the swb0035 recording in the original data
         if reverberate_opts == "" or i == 0:
             wav_corrupted_pipe = "{0}".format(wav_original_pipe)
         else:

From 823bcac416a6c19f73038b0f448aa14d4db85ccf Mon Sep 17 00:00:00 2001
From: Tom Ko
Date: Mon, 7 Nov 2016 10:49:39 -0500
Subject: [PATCH 8/9] fixing typo

---
 .../s5c/local/nnet3/multi_condition/run_ivector_common.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
index 5f67e40d0f1..b5acdd27a3c 100755
--- a/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
+++ b/egs/swbd/s5c/local/nnet3/multi_condition/run_ivector_common.sh
@@ -5,7 +5,7 @@
 
 . ./cmd.sh
 
-stage=3
+stage=1
 num_data_reps=1  # number of reverberated copies of data to generate
                  # These will be combined with the original data.
 input_data_dir=train_nodup
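Review note on the speed-perturbation rewrite in PATCH 7/9 above: utils/data/perturb_data_dir_speed_3way.sh bundles what the deleted block did by hand, roughly the sketch below, reconstructed from the removed lines (the validate/fix_data_dir steps are elided):

  utils/perturb_data_dir_speed.sh 0.9 data/train_nodup data/temp1
  utils/perturb_data_dir_speed.sh 1.1 data/train_nodup data/temp2
  utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/train_nodup data/temp0
  utils/combine_data.sh data/train_nodup_sp data/temp0 data/temp1 data/temp2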
@@ -98,6 +98,8 @@ fi
 
 # ivector extractor training
 if [ $stage -le 5 ]; then
+  # Here it is good enough to train the lda_mllt transform with the clean data,
+  # as it only affects the diagonal GMM, which is just used to initialize the full GMM.
   steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
     --splice-opts "--left-context=3 --right-context=3" \
     5500 90000 data/train_100k_nodup_hires \

From 01e47f6c0e86879d7e2a77d5e49b14f3769e0dd6 Mon Sep 17 00:00:00 2001
From: Tom Ko
Date: Mon, 7 Nov 2016 21:55:48 -0500
Subject: [PATCH 9/9] Moving tuning/run_tdnn_7g.sh back to multi_condition/run_tdnn_7f.sh

---
 .../run_tdnn_7f.sh}                           | 14 ++++++++------
 egs/swbd/s5c/local/chain/run_tdnn.sh          |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)
 rename egs/swbd/s5c/local/chain/{tuning/run_tdnn_7g.sh => multi_condition/run_tdnn_7f.sh} (94%)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
similarity index 94%
rename from egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
rename to egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
index cf1343e5041..75b541b49e1 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh
+++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh
@@ -1,13 +1,15 @@
 #!/bin/bash
 
-# 7g is as 7f, but reverberating the training data with room impulse responses
-# which leads to better results
+# This script (multi_condition/run_tdnn_7f.sh) is the reverberated version of
+# tuning/run_tdnn_7f.sh. It reverberates the training data with room impulse responses
+# which leads to better results.
+# (The reverberation of data is done in multi_condition/run_ivector_common.sh)
 # This script assumes a mixing of the original training data with its reverberated copy
 # and results in a 2-fold training set. Thus the number of epochs is halved to
 # keep the same training time. The model converges after 2 epochs of training;
 # the WER doesn't change much with more epochs of training.
-# local/chain/compare_wer.sh 7f 7g
-# System                      7f          7g
+# local/chain/compare_wer.sh tuning/7f multi_condition/7f
+# System                      tuning/7f   multi_condition/7f
 # WER on train_dev(tg)        14.46       14.27
 # WER on train_dev(fg)        13.23       13.16
 # WER on eval2000(tg)         17.0        16.3
 # WER on eval2000(fg)         15.4        14.6
@@ -26,7 +28,7 @@ stage=1
 train_stage=-10
 get_egs_stage=-10
 speed_perturb=true
-dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true.
+dir=exp/chain/tdnn_7f # Note: _sp will get added to this if $speed_perturb == true.
 decode_iter=
 ivector_dir=exp/nnet3_rvb
 num_data_reps=1 # number of reverberated copies of data to generate
@@ -78,7 +80,7 @@ if [ "$speed_perturb" == "true" ]; then
   suffix=_sp
 fi
 
-dir=${dir}${affix:+_$affix}$suffix
+dir=${dir}${affix:+_$affix}${suffix}_rvb${num_data_reps}
 clean_train_set=${input_train_set}${suffix}
 train_set=${clean_train_set}_rvb${num_data_reps}
 ali_dir=exp/tri4_ali_nodup$suffix
diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh
index 4b80e886c66..669740d5f27 120000
--- a/egs/swbd/s5c/local/chain/run_tdnn.sh
+++ b/egs/swbd/s5c/local/chain/run_tdnn.sh
@@ -1 +1 @@
-tuning/run_tdnn_7g.sh
\ No newline at end of file
+tuning/run_tdnn_7f.sh
\ No newline at end of file
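Review note: after the whole series is applied, the reverberated recipe ends up at local/chain/multi_condition/run_tdnn_7f.sh and the default entry point points back at the clean tuning script; a quick check (sketch):

  ls -l egs/swbd/s5c/local/chain/run_tdnn.sh
  # run_tdnn.sh -> tuning/run_tdnn_7f.sh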