diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index 731b6721a78..de0a925a242 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -48,7 +48,6 @@ reco_nj=40 # test options test_nj=30 -test_stage=1 . ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi @@ -85,12 +84,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ exit 1 fi -data_id=$(basename $data_dir) whole_data_dir=${data_dir}_whole -targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 +whole_data_id=$(basename $whole_data_dir) rvb_data_dir=${whole_data_dir}_rvb_hires -rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -101,26 +98,15 @@ fi ############################################################################### if [ $stage -le 1 ]; then steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ - $whole_data_dir exp/make_mfcc/${data_id}_whole - steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - - # Use recording as the "speaker". This is required by prepare_targets_gmm.sh script. - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ @@ -132,6 +118,7 @@ if [ $stage -le 3 ]; then $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! 
-f rirs_noises.zip ]; then @@ -164,30 +151,29 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $reco_nj \ ${rvb_data_dir} steps/compute_cmvn_stats.sh ${rvb_data_dir} utils/fix_data_dir.sh $rvb_data_dir fi if [ $stage -le 6 ]; then - rvb_targets_dirs=() - for i in `seq 1 $num_data_reps`; do - steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ - $targets_dir ${targets_dir}_temp_$i || exit 1 - rvb_targets_dirs+=(${targets_dir}_temp_$i) - done - - steps/segmentation/combine_targets_dirs.sh \ - $rvb_data_dir ${rvb_targets_dir} \ - ${rvb_targets_dirs[@]} || exit 1; - - rm -r ${rvb_targets_dirs[@]} + rvb_targets_dirs=() + for i in `seq 1 $num_data_reps`; do + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) + done + + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} fi -sad_nnet_dir=exp/segmentation${affix}/tdnn_stats_asr_sad_1a -#sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_1a -#sad_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + +sad_nnet_dir=$dir/tdnn_stats_asr_sad_1a if [ $stage -le 7 ]; then # Train a STATS-pooling network for SAD @@ -216,6 +202,13 @@ fi chain_dir=exp/chain/tdnn_lstm_1a +# The context options in "sad_opts" must match the options used to train the +# SAD network in "sad_nnet_dir" +sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + +# For LSTM SAD network, the options might be something like +# sad_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + if [ $stage -le 9 ]; then # Use left and right context options that were used when training # the chain nnet diff --git a/egs/babel/s5d/local/run_asr_segmentation.sh b/egs/babel/s5d/local/run_asr_segmentation.sh index 7bfc3fd60ca..f70775526b6 100755 --- a/egs/babel/s5d/local/run_asr_segmentation.sh +++ b/egs/babel/s5d/local/run_asr_segmentation.sh @@ -35,11 +35,15 @@ merge_weights=1.0,0.1,0.5 prepare_targets_stage=-10 nstage=-10 train_stage=-10 -test_stage=-10 affix=_1a stage=-1 nj=80 +reco_nj=40 + +# test options +test_nj=32 +test_stage=-10 # Babel specific configuration. These two lines can be removed when adapting to other corpora. [ ! -f ./lang.conf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1 @@ -63,14 +67,14 @@ garbage_phones=" " silence_phones=" SIL" for p in $garbage_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/garbage_phones.txt for p in $silence_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/silence_phones.txt @@ -81,6 +85,7 @@ if ! 
cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole +whole_data_id=$(basename $whole_data_dir) if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -91,39 +96,34 @@ fi ############################################################################### if [ $stage -le 1 ]; then if $use_pitch; then - steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ + steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $reco_nj --write-utt2num-frames true \ ${whole_data_dir} || exit 1 else - steps/make_plp.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ + steps/make_plp.sh --cmd "$train_cmd" --nj $reco_nj --write-utt2num-frames true \ ${whole_data_dir} || exit 1 fi + steps/compute_cmvn_stats.sh $whole_data_dir + utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ - --nj 80 --reco-nj 40 --lang-test $lang_test \ + --nj $nj --reco-nj $reco_nj --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --graph-dir "$graph_dir" \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj $reco_nj \ ${whole_data_dir}_hires_bp steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp fi @@ -132,7 +132,7 @@ if [ $stage -le 5 ]; then # Train a TDNN-LSTM network for SAD local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir $dir \ + --targets-dir $targets_dir \ --data-dir ${whole_data_dir}_hires_bp fi @@ -147,7 +147,7 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj $test_nj --acwt 0.3 --stage $test_stage \ data/dev10h.pem \ exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ mfcc_hires_bp \ diff --git a/egs/chime5/s5/local/train_lms_srilm.sh b/egs/chime5/s5/local/train_lms_srilm.sh index 8caa251fa35..5a1d56d24b3 100755 --- a/egs/chime5/s5/local/train_lms_srilm.sh +++ b/egs/chime5/s5/local/train_lms_srilm.sh @@ -99,7 +99,7 @@ fi # Kaldi transcript files contain Utterance_ID as the first word; remove it # We also have to avoid skewing the LM by incorporating the same sentences # from different channels -sed -e "s/\.CH.//" -e "s/_.\-./_/" $train_text | sort -u | \ +sed -e "s/\.CH.//" -e 
"s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt if (($?)); then echo "Failed to create $tgtdir/train.txt from $train_text" diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 4a39dfb66ac..2f050be93f2 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7n.sh \ No newline at end of file +tuning/run_tdnn_7o.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh new file mode 100755 index 00000000000..753dfc632ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -0,0 +1,297 @@ +#!/bin/bash + + +# 7o is as 7n but with a bunch of tuning changes affecting both the structure +# and the learning rates/l2 regularization. Structurally the main change is +# that we also do splicing via an extra layer whose input and output are in the +# "small" dim (256); this increases the left and right context. We also change +# the orthonormal-constraint to be "floating" meaning it doesn't constrain the +# size of the matrix (the value orthonormal-constraint=-1 is interpreted +# specially by the code), which means we can control how fast these constrained +# layers learn layers via l2, just like the unconstrained layers. Also the l2 +# values were increased and the learning rates were decreased; there are +# more epochs (6->8); and the dimension of some of the layers (the ones that +# are subsampled and which don't receive skip-splicing) was increased from +# 1280 to 1536. The config is a bit messy and I'd like to find a way to +# encapsulate things a bit better; treat this as a work in progress. +# +# +# +# local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp +# System tdnn7n_sp tdnn7m26j_sp +# WER on train_dev(tg) 12.18 11.74 +# WER on train_dev(fg) 11.12 10.69 +# WER on eval2000(tg) 14.9 14.6 +# WER on eval2000(fg) 13.5 13.1 +# WER on rt03(tg) 18.4 17.5 +# WER on rt03(fg) 16.2 15.4 +# Final train prob -0.077 -0.070 +# Final valid prob -0.093 -0.084 +# Final train prob (xent) -0.994 -0.883 +# Final valid prob (xent) -1.0194 -0.9110 +# Num-parameters 20111396 22865188 + + +# exp/chain/tdnn7o_sp: num-iters=525 nj=3..16 num-params=22.9M dim=40+100->6034 combine=-0.074->-0.073 (over 7) xent:train/valid[348,524,final]=(-1.20,-0.884,-0.883/-1.24,-0.918,-0.911) logprob:train/valid[348,524,final]=(-0.100,-0.071,-0.070/-0.115,-0.086,-0.084) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7o +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1280 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1280 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + 
output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + +# --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + + + steps/nnet3/chain/train.py --stage $train_stage \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
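+  # prepare_online_decoding.sh copies the final model and bundles the MFCC
+  # config and the i-vector extractor setup into ${dir}_online, so the
+  # decoding below can compute features on the fly from wav.scp.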
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index af7d3a428ba..6da8b55631d 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -38,11 +38,15 @@ merge_weights=1.0,0.1,0.5 prepare_targets_stage=-10 nstage=-10 train_stage=-10 -test_stage=-10 num_data_reps=2 affix=_1a # For segmentation stage=-1 nj=80 +reco_nj=40 + +# test options +test_stage=-10 +test_nj=32 . ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi @@ -79,12 +83,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ exit 1 fi -data_id=$(basename $data_dir) whole_data_dir=${data_dir}_whole -targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 +whole_data_id=$(basename $whole_data_dir) rvb_data_dir=${whole_data_dir}_rvb_hires -rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -94,29 +96,20 @@ fi # Extract features for the whole data directory ############################################################################### if [ $stage -le 1 ]; then - steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \ - $whole_data_dir exp/make_mfcc/${data_id}_whole - steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! 
-f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ - --nj 80 --reco-nj 40 --lang-test $lang_test \ + --nj $nj --reco-nj $reco_nj --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ --merge-weights "$merge_weights" \ @@ -124,6 +117,7 @@ if [ $stage -le 3 ]; then $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! -f rirs_noises.zip ]; then @@ -156,7 +150,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $reco_nj \ ${rvb_data_dir} steps/compute_cmvn_stats.sh ${rvb_data_dir} utils/fix_data_dir.sh $rvb_data_dir @@ -196,7 +190,7 @@ if [ $stage -le 8 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj $test_nj --acwt 0.3 --stage $test_stage \ data/eval2000 \ exp/segmentation${affix}/tdnn_stats_asr_sad_1a \ mfcc_hires \ diff --git a/egs/vystadial_cz/s5b/RESULTS b/egs/vystadial_cz/s5b/RESULTS new file mode 100644 index 00000000000..ec945059fc5 --- /dev/null +++ b/egs/vystadial_cz/s5b/RESULTS @@ -0,0 +1,17 @@ + +# monophone system (shortest 10k) +%WER 75.81 [ 8989 / 11858, 421 ins, 2691 del, 5877 sub ] exp/mono/decode_dev/wer_10_0.0 +# delta + delta-delta triphone system +%WER 55.97 [ 6637 / 11858, 494 ins, 1664 del, 4479 sub ] exp/tri1/decode_dev/wer_14_0.5 +# LDA+MLLT system +%WER 50.98 [ 6045 / 11858, 439 ins, 1564 del, 4042 sub ] exp/tri2b/decode_dev/wer_12_0.5 +# LDA+MLLT+SAT system +%WER 51.76 [ 6138 / 11858, 627 ins, 1276 del, 4235 sub ] exp/tri3b/decode_dev/wer_12_0.0 +# LDA+MLLT+SAT system with silence probabilities +%WER 51.75 [ 6137 / 11858, 622 ins, 1282 del, 4233 sub ] exp/tri3b/decode_sp_dev/wer_12_0.5 + +# chain tdnn system +%WER 33.29 [ 3948 / 11858, 480 ins, 787 del, 2681 sub ] exp/chain/tdnn1a_sp/decode_dev/wer_9_1.0 +%WER 32.15 [ 3735 / 11617, 394 ins, 803 del, 2538 sub ] exp/chain/tdnn1a_sp/decode_test/wer_11_0.5 +%WER 33.20 [ 3937 / 11858, 514 ins, 734 del, 2689 sub ] exp/chain/tdnn1a_sp_online/decode_dev/wer_10_0.5 +%WER 32.04 [ 3722 / 11617, 451 ins, 723 del, 2548 sub ] exp/chain/tdnn1a_sp_online/decode_test/wer_10_0.5 diff --git a/egs/vystadial_cz/s5b/cmd.sh b/egs/vystadial_cz/s5b/cmd.sh new file mode 100644 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/vystadial_cz/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5b/conf/decode.config b/egs/vystadial_cz/s5b/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/vystadial_cz/s5b/conf/mfcc.conf b/egs/vystadial_cz/s5b/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/vystadial_cz/s5b/conf/mfcc_hires.conf b/egs/vystadial_cz/s5b/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/vystadial_cz/s5b/conf/online_cmvn.conf b/egs/vystadial_cz/s5b/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/vystadial_cz/s5b/env_voip_cs.sh b/egs/vystadial_cz/s5b/env_voip_cs.sh new file mode 120000 index 00000000000..7adc3c6960f --- /dev/null +++ b/egs/vystadial_cz/s5b/env_voip_cs.sh @@ -0,0 +1 @@ +../s5/env_voip_cs.sh \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/local/chain/compare_wer.sh b/egs/vystadial_cz/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..14ca1196e64 --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/compare_wer.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copied from egs/mini_librispeech/s5/local/chain/compare_wer.sh (commit 421a062477d732fc02e2109b9d50857ae0f18661) + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev" + "#WER test") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev test) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
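+  # (for non-discriminative systems the numbers printed below are read from
+  # the nnet3 diagnostic logs compute_prob_{train,valid}.final.log and
+  # progress.1.log)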
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh b/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..496ee5e84ca --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# Adapted from egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh + +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1a_sp_online +# System tdnn1a_sp tdnn1a_sp_online +#WER dev 33.29 33.20 +#WER test 32.15 32.04 +# Final train prob -0.0988 +# Final valid prob -0.1913 +# Final train prob (xent) -1.6242 +# Final valid prob (xent) -1.9833 +# Num-params 6117328 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=38 nj=2..12 num-params=6.1M dim=40+100->2024 combine=-0.116->-0.115 (over 2) xent:train/valid[24,37,final]=(-1.89,-1.65,-1.62/-2.17,-2.01,-1.98) logprob:train/valid[24,37,final]=(-0.134,-0.105,-0.099/-0.206,-0.196,-0.191) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="dev test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
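+  # (the 'chain' numerator supervision is built from these lattices, so
+  # alternative pronunciations and alignment paths are kept rather than a
+  # single forced alignment)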
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_sp $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
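+  # (with xent_regularize=0.1, as set at the top of this script, the factor
+  # comes out to 0.5 / 0.1 = 5.0)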
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=15 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_sp_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l &2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +export LC_ALL=C + diff --git a/egs/vystadial_cz/s5b/run.sh b/egs/vystadial_cz/s5b/run.sh new file mode 100755 index 00000000000..f837b273466 --- /dev/null +++ b/egs/vystadial_cz/s5b/run.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Change this location to somewhere where you want to put the data. +data=$HOME/vystadial_cz + +# Load training parameters +. ./env_voip_cs.sh + +. ./cmd.sh +. ./path.sh + +stage=0 +. 
utils/parse_options.sh + +set -euo pipefail + +mkdir -p $data + +if [ $stage -le 0 ]; then + local/download_cs_data.sh $data || exit 1; +fi + +lm="build3" + +if [ $stage -le 1 ]; then + local/data_split.sh --every_n 1 $data data "$lm" "dev test" + + local/create_LMs.sh data/local data/train/trans.txt \ + data/test/trans.txt data/local/lm "$lm" + + gzip data/local/lm/$lm + + local/prepare_cs_transcription.sh data/local data/local/dict + + local/create_phone_lists.sh data/local/dict + + utils/prepare_lang.sh data/local/dict '_SIL_' data/local/lang data/lang + + utils/format_lm.sh data/lang data/local/lm/$lm.gz data/local/dict/lexicon.txt data/lang_test + + for part in dev test train; do + mv data/$part/trans.txt data/$part/text + done +fi + +if [ $stage -le 2 ]; then + mfccdir=mfcc + + for part in dev train; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir + done + + # Get the shortest 10000 utterances first because those are more likely + # to have accurate alignments. + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort +fi + +# train a monophone system +if [ $stage -le 3 ]; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_10kshort data/lang exp/mono + ( + utils/mkgraph.sh data/lang_test \ + exp/mono exp/mono/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono/graph \ + data/$test exp/mono/decode_$test + done + )& + + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali_train +fi + +# train a first delta + delta-delta triphone system on all utterances +if [ $stage -le 4 ]; then + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train data/lang exp/mono_ali_train exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri1 exp/tri1/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri1/graph \ + data/$test exp/tri1/decode_$test + done + )& + + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali_train +fi + +# train an LDA+MLLT system. +if [ $stage -le 5 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ + data/train data/lang exp/tri1_ali_train exp/tri2b + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri2b exp/tri2b/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri2b/graph \ + data/$test exp/tri2b/decode_$test + done + )& + + # Align utts using the tri2b model + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train data/lang exp/tri2b exp/tri2b_ali_train +fi + +# Train tri3b, which is LDA+MLLT+SAT +if [ $stage -le 6 ]; then + steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ + data/train data/lang exp/tri2b_ali_train exp/tri3b + + # decode using the tri3b model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri3b exp/tri3b/graph + for test in dev; do + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph data/$test \ + exp/tri3b/decode_$test + done + )& +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. 
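+# get_prons.sh collects pronunciation and silence counts from the tri3b
+# system, dict_dir_add_pronprobs.sh folds them into data/local/dict_sp, and
+# prepare_lang.sh / format_lm.sh then rebuild data/lang_sp and
+# data/lang_sp_test, which the later stages use for alignment and decoding.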
+if [ $stage -le 7 ]; then + steps/get_prons.sh --cmd "$train_cmd" \ + data/train data/lang exp/tri3b + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_sp + + utils/prepare_lang.sh data/local/dict_sp "_SIL_" data/local/lang_tmp data/lang_sp + + utils/format_lm.sh data/lang_sp data/local/lm/$lm.gz data/local/dict_sp/lexicon.txt data/lang_sp_test + + steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ + data/train data/lang_sp exp/tri3b exp/tri3b_ali_train_sp +fi + +if [ $stage -le 8 ]; then + # Test the tri3b system with the silprobs and pron-probs. + + # decode using the tri3b model + utils/mkgraph.sh data/lang_sp_test \ + exp/tri3b exp/tri3b/graph_sp + + for test in dev; do + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph_sp data/$test \ + exp/tri3b/decode_sp_$test + done +fi + +# Train a chain model +if [ $stage -le 9 ]; then + local/chain/run_tdnn.sh --stage 0 +fi + +# Don't finish until all background decoding jobs are finished. +wait diff --git a/egs/vystadial_cz/s5b/steps b/egs/vystadial_cz/s5b/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/vystadial_cz/s5b/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/utils b/egs/vystadial_cz/s5b/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/vystadial_cz/s5b/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 4c6a37fb837..b20c64ab9ba 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -941,9 +941,10 @@ def __init__(self, action=common_lib.NullstrToNoneAction, help="Script to launch egs jobs") self.parser.add_argument("--use-gpu", type=str, - action=common_lib.StrToBoolAction, - choices=["true", "false"], - help="Use GPU for training", default=True) + choices=["true", "false", "yes", "no", "wait"], + help="Use GPU for training. " + "Note 'true' and 'false' are deprecated.", + default="yes") self.parser.add_argument("--cleanup", type=str, action=common_lib.StrToBoolAction, choices=["true", "false"], default=True, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 6fbde1fbbcc..99911b39fb2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,6 +69,7 @@ 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, 'renorm-component': xlayers.XconfigRenormComponent, + 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent } diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 63f6278d1ca..f7da8956d1c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -16,7 +16,7 @@ class XconfigRenormComponent(XconfigLayerBase): """This class is for parsing lines like - 'renorm-component name=renorm input=Append(-3,0,3)' + 'renorm-component name=renorm1 input=Append(-3,0,3)' which will produce just a single component, of type NormalizeComponent. 
Parameters of the class, and their defaults: @@ -70,9 +70,65 @@ def _generate_config(self): return configs +class XconfigBatchnormComponent(XconfigLayerBase): + """This class is for parsing lines like + 'batchnorm-component name=batchnorm input=Append(-3,0,3)' + which will produce just a single component, of type BatchNormComponent. + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + target-rms=1.0 [The target RMS of the BatchNormComponent] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'target-rms': 1.0 } + + def check_configs(self): + assert self.config['target-rms'] > 0.0 + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + target_rms = self.config['target-rms'] + + configs = [] + line = ('component name={0} type=BatchNormComponent dim={1} target-rms={2}'.format( + self.name, input_dim, target_rms)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigNoOpComponent(XconfigLayerBase): """This class is for parsing lines like - 'no-op-component name=renorm input=Append(-3,0,3)' + 'no-op-component name=noop1 input=Append(-3,0,3)' which will produce just a single component, of type NoOpComponent. Parameters of the class, and their defaults: @@ -127,7 +183,7 @@ class XconfigLinearComponent(XconfigLayerBase): """This class is for parsing lines like 'linear-component name=linear1 dim=1024 input=Append(-3,0,3)' which will produce just a single component, of type LinearComponent, with - output-dim 1024 in this case, and input-dim determined by the dimention + output-dim 1024 in this case, and input-dim determined by the dimension of the input . Parameters of the class, and their defaults: @@ -137,7 +193,7 @@ class XconfigLinearComponent(XconfigLayerBase): The following (shown with their effective defaults) are just passed through to the component's config line. 
- orthonormal-constraint=-1 + orthonormal-constraint=0.0 max-change=0.75 l2-regularize=0.0 diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py index 20d9c73eaf0..99f622d79a7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py @@ -210,7 +210,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -219,10 +221,9 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.combine_gpu_opt = "" - + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: logger.warning("Without using a GPU this will be very slow. " diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 613b70fd192..6a68d9ecb6e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -228,7 +228,9 @@ def process_args(args): args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -237,9 +239,9 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.combine_gpu_opt = "" + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: logger.warning("Without using a GPU this will be very slow. " diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 2cb314cca61..dd1c97b350d 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -118,7 +118,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -127,11 +129,12 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" + else: logger.warning("Without using a GPU this will be very slow. 
" "nnet3 does not yet support multiple threads.") diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 14922247cd3..0e787b0b647 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -126,7 +126,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -135,10 +137,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 4623756caba..bd94fb7cb94 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -179,7 +179,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -188,10 +190,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index fd74e5c9f44..83a1da8eca1 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -173,7 +173,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -182,10 +184,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py 
b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py index f8d5008c3e9..8c53e5e8db9 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py @@ -165,6 +165,8 @@ def run(args): axis=0) utts.sort(key=lambda x: segments[x][1]) # sort on start time + end_frame_accounted = 0 + for i, utt in enumerate(utts): if utt not in segments or utt not in targets: num_utt_err += 1 @@ -208,45 +210,58 @@ def run(args): num_utt_err += 1 continue + # Fix end_frame and num_frames if the segment goes beyond + # the length of the recording. if end_frame > reco2num_frames[reco]: end_frame = reco2num_frames[reco] num_frames = end_frame - start_frame - if num_frames < 0: + # Fix "num_frames" and "end_frame" if "num_frames" is lower + # than the size of the targets matrix "mat" + num_frames = min(num_frames, mat.shape[0]) + end_frame = start_frame + num_frames + + if num_frames <= 0: logger.warning("For utterance {utt}, start-frame {start} " "is outside the recording" "".format(utt=utt, start=start_frame)) num_utt_err += 1 continue - prev_utt_end_frame = ( - int(segments[utts[i-1]][2] / args.frame_shift + 0.5) - if i > 0 else 0) - if start_frame < prev_utt_end_frame: - # Segment overlaps with the previous utterance + if end_frame < end_frame_accounted: + logger.warning("For utterance {utt}, end-frame {end} " + "is before the end of a previous segment. " + "i.e. this segment is completely within " + "another segment. Ignoring this segment." + "".format(utt=utt, end=end_frame)) + num_utt_err +=1 + continue + + if start_frame < end_frame_accounted: + # Segment overlaps with a previous utterance # Combine targets using a weighted interpolation using a # triangular window with a weight of 1 at the start/end of # overlap and 0 at the end/start of the segment - for n in range(0, prev_utt_end_frame - start_frame): - w = float(n) / float(prev_utt_end_frame - start_frame) + for n in range(0, end_frame_accounted - start_frame): + w = float(n) / float(end_frame_accounted - start_frame) reco_mat[n + start_frame, :] = ( reco_mat[n + start_frame, :] * (1.0 - w) + mat[n, :] * w) - num_frames = min(num_frames, mat.shape[0]) - end_frame = start_frame + num_frames - reco_mat[prev_utt_end_frame:end_frame, :] = ( - mat[(prev_utt_end_frame-start_frame): - (end_frame-start_frame), :]) + if end_frame > end_frame_accounted: + reco_mat[end_frame_accounted:end_frame, :] = ( + mat[(end_frame_accounted-start_frame): + (end_frame-start_frame), :]) else: # No overlap with the previous utterances. # So just add it to the output. - num_frames = min(num_frames, mat.shape[0]) - reco_mat[start_frame:(start_frame + num_frames), :] = ( + reco_mat[start_frame:end_frame, :] = ( mat[0:num_frames, :]) logger.debug("reco_mat shape = %s, mat shape = %s, " "start_frame = %d, end_frame = %d", reco_mat.shape, mat.shape, start_frame, end_frame) + + end_frame_accounted = end_frame num_utt += 1 if reco_mat.shape[0] > 0: diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index f8557a70177..20bcfd96d96 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -66,8 +66,10 @@ if [ $# -ne 6 ]; then Usage: $0 e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a - Note: Both and must have the recording-id - as speaker, and must contain feats.scp. 
+ Note: <whole-data-dir> is expected to have feats.scp and <data-dir> is + expected to have segments file. We will get the features for <data-dir> by + using row ranges of <whole-data-dir>/feats.scp. This script will + work on a copy of <data-dir> created to have the recording-id as the speaker-id. EOF exit 1 fi @@ -97,8 +99,7 @@ else extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt" fi -for f in $in_data_dir/feats.scp $in_whole_data_dir/feats.scp \ - $in_data_dir/segments \ +for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \ $lang/phones.txt $garbage_phones_list $silence_phones_list \ $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do if [ ! -f $f ]; then @@ -125,8 +126,7 @@ if [ $stage -le 0 ]; then utils/data/modify_speaker_info_to_recording.sh \ $in_data_dir $dir/$data_id || exit 1 - steps/compute_cmvn_stats.sh $dir/$data_id || exit 1 - utils/validate_data_dir.sh $dir/$data_id || exit 1 + utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1 fi # Work with a temporary data directory with recording-id as the speaker labels. @@ -135,6 +135,13 @@ data_dir=$dir/${data_id} ############################################################################### # Get feats for the manual segments ############################################################################### +if [ $stage -le 1 ]; then + utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp + cp $data_dir/tmp/feats.scp $data_dir + + steps/compute_cmvn_stats.sh $data_dir || exit 1 +fi + if [ $stage -le 2 ]; then utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id diff --git a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh index 5db6be731ce..dd315cc405b 100755 --- a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh +++ b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh @@ -46,7 +46,7 @@ utils/data/internal/combine_segments_to_recording.py \ --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1 if [ -f $data/text ]; then - utils/apply_map.pl -f 2 $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 + utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 fi rm $dir/reco2sorted_utts diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 47670a2065a..fa5ff7856b0 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -85,7 +85,7 @@ if [ $# -ne 4 ]; then echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" echo " # markers on phones to indicate word-internal positions. " echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " - echo " # all non-silence phones. " + echo " # all silence phones. " echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" echo " --phone-symbol-table # default: \"\"; if not empty, use the provided " echo " # phones.txt as phone symbol table. This is useful " @@ -115,7 +115,7 @@ silprob=false echo "*Error validating directory $srcdir*" && exit 1; if [[ ! -f $srcdir/lexicon.txt ]]; then - echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" + echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt" perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; fi if [[ !
-f $srcdir/lexiconp.txt ]]; then diff --git a/src/configure b/src/configure index df7f0a96dd2..277bb340781 100755 --- a/src/configure +++ b/src/configure @@ -37,7 +37,7 @@ # # addition of the the --android-includes flag because the toolchains # # produced by the Android NDK don't always include the C++ stdlib # # headers in the normal cross compile include path. -# --host=aarch64-linux-android +# --host=aarch64-linux-android # # support for 64bit ARMv8(AArch64) architecture in Android. # This should be incremented after any significant change to the configure @@ -426,10 +426,10 @@ function configure_cuda { fi case $CUDA_VERSION in - 5_5) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; - 6_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; - 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; - 8_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; + 5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; + 8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; 9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70" ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; esac diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index 15b53d93d7d..f8259a3a82e 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -132,7 +132,23 @@ void WaveInfo::Read(std::istream &is) { uint32 riff_chunk_read = 0; riff_chunk_read += 4; // WAVE included in riff_chunk_size. - reader.Expect4ByteTag("fmt "); + // Possibly skip any RIFF tags between 'WAVE' and 'fmt '. + // Apple devices produce a filler tag 'JUNK' for memory alignment. 
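+ // For illustration (an assumed, typical layout): such a file might begin
+ //   "RIFF" <riff-size> "WAVE" "JUNK" <junk-size> <junk-size filler bytes> "fmt " <fmt-size> ...
+ // Each such chunk is a 4-byte tag followed by a 4-byte little-endian size and
+ // that many payload bytes, so any tag other than "fmt " can be read and then
+ // skipped generically, which is what the loop below does.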
+ reader.Read4ByteTag(); + riff_chunk_read += 4; + while (strcmp(reader.tag,"fmt ") != 0) { + uint32 filler_size = reader.ReadUint32(); + riff_chunk_read += 4; + for (uint32 i = 0; i < filler_size; i++) { + is.get(); // read 1 byte, + } + riff_chunk_read += filler_size; + // get next RIFF tag, + reader.Read4ByteTag(); + riff_chunk_read += 4; + } + + KALDI_ASSERT(strcmp(reader.tag,"fmt ") == 0); uint32 subchunk1_size = reader.ReadUint32(); uint16 audio_format = reader.ReadUint16(); num_channels_ = reader.ReadUint16(); @@ -190,9 +206,8 @@ void WaveInfo::Read(std::istream &is) { KALDI_ERR << "Unexpected block_align: " << block_align << " vs. " << num_channels_ << " * " << (bits_per_sample/8); - riff_chunk_read += 8 + subchunk1_size; - // size of what we just read, 4 bytes for "fmt " + 4 - // for subchunk1_size + subchunk1_size itself. + riff_chunk_read += 4 + subchunk1_size; + // size of what we just read, 4 for subchunk1_size + subchunk1_size itself. // We support an optional "fact" chunk (which is useless but which // we encountered), and then a single "data" chunk. @@ -217,10 +232,7 @@ void WaveInfo::Read(std::istream &is) { riff_chunk_read += 4; } - if (strcmp(reader.tag, "data")) - KALDI_ERR << "WaveData: expected data chunk, got instead " - << reader.tag; - + KALDI_ASSERT(strcmp(reader.tag, "data") == 0); uint32 data_chunk_size = reader.ReadUint32(); riff_chunk_read += 4; diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 9a4559803ad..77d78113bbb 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -322,10 +322,23 @@ void Compiler::CreateStepInfo( stride_type); } else { // kDimRange. Will just be a sub-matrix of a Component or Input node. - int32 cindex_id = this_info.output_cindex_ids.front(), - input_cindex_id = graph_.dependencies[cindex_id][0], - input_step = cindex_id_to_location_[input_cindex_id].first; - KALDI_ASSERT(input_step != -1 && input_step < step); + std::vector::const_iterator + iter = this_info.output_cindex_ids.begin(), + end = this_info.output_cindex_ids.end(); + int32 source_cindex_id = -1; + for (; iter != end; ++iter) { + int32 cindex_id = *iter; + if (!graph_.dependencies[cindex_id].empty()) { + KALDI_ASSERT(graph_.dependencies[cindex_id].size() == 1); + source_cindex_id = graph_.dependencies[cindex_id][0]; + break; + } + } + KALDI_ASSERT(source_cindex_id >= 0); + int32 input_step = cindex_id_to_location_[source_cindex_id].first; + KALDI_ASSERT(this_info.output_cindex_ids.size() == + steps_[input_step].output_cindex_ids.size()); + KALDI_ASSERT(input_step >= 0 && input_step < step); KALDI_PARANOID_ASSERT(this_info.output_indexes == steps_[input_step].output_indexes); this_info.value = computation->NewSubMatrix(steps_[input_step].value, @@ -376,6 +389,8 @@ void Compiler::CreateStepInfo( KALDI_ASSERT(cur_dim_offset == desc.Dim(nnet_)); } } + KALDI_ASSERT(static_cast(this_info.output_cindex_ids.size()) == + computation->submatrices[this_info.value].num_rows); } } diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 955ac47cbe1..21918c0e539 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -134,7 +134,9 @@ class Compiler { std::vector *deriv_needed); // this sets up steps_, destroying the input "by_step" in the process. It - // also sets various matrix and sub-matrix sizes in "computation". + // also sets various matrix and sub-matrix sizes in "computation". 
The input + // 'by_step' is elsewhere referred to as just 'step'; it is a vector of steps, + // and each step is a vector of cindex_ids that are computed by that step. void CreateStepInfo(const std::vector &deriv_needed, const std::vector &step_to_segment, std::vector > *by_step, diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index bded9e84b2f..9c84115d406 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -1643,7 +1643,7 @@ int32 ComputationStepsComputer::AddStep(const std::vector &cindexes, *out_iter = cindex_id; if (added) { KALDI_ASSERT(cindex_id == static_cast(locations_->size())); - locations_->resize(cindex_id + 1); + locations_->resize(cindex_id + 1, std::pair(-1, -1)); locations_->back().first = step_index; locations_->back().second = row_index; locations = &((*locations_)[0]); // in case it was reallocated @@ -1867,68 +1867,52 @@ void ComputationStepsComputer::ProcessDimRangeSubPhase( ConvertToCindexIds(input_cindexes, &input_cindex_ids); std::vector > locations; ConvertToLocations(input_cindex_ids, &locations); - std::sort(locations.begin(), locations.end()); + + // get a list of the source step indexes (corresponding to computations for the + // source component-node) + std::unordered_set source_step_indexes; KALDI_ASSERT(!locations.empty()); std::vector >::const_iterator locations_iter = locations.begin(), locations_end = locations.end(); - // Each unique .first number in locations (i.e. each source step, and they - // will all correspond to component-output or input steps) will generate one - // 'step' of type kDimRange. Because dim-range nodes must be contiguous - // ranges of a source step (since they are represented as sub-matrices), for - // each source step we work out the first and last row-index (i.e. first and - // last .second member of locations) and use that to reconstruct the range. - - // each element of 'steps' will be (source_step, (begin_row, end_row)) so that - // the source of the dim-range node is indexes begin_row ... end_row-1 in that - // source step. - std::vector > > steps; - - int32 cur_source_step = locations_iter->first, - cur_row_begin = locations_iter->second, - cur_row_end = cur_row_begin + 1; - while (1) { - ++locations_iter; - if (locations_iter == locations_end || - locations_iter->first != cur_source_step) { - // we reached the end of a run of the same step. - std::pair > this_step; - this_step.first = cur_source_step; - this_step.second.first = cur_row_begin; - this_step.second.second = cur_row_end; - steps.push_back(this_step); - if (locations_iter != locations_end) { - cur_source_step = locations_iter->first; - cur_row_begin = locations_iter->second; - cur_row_end = cur_row_begin + 1; - } else { - break; - } - } else { - cur_row_end = locations_iter->second + 1; + + // 'cur_source_step_index' is just an optimization to prevent unnecessary + // unordered_set inserts. + int32 cur_source_step_index = -1; + for (; locations_iter != locations_end; ++locations_iter) { + int32 source_step_index = locations_iter->first; + if (source_step_index != cur_source_step_index) { + cur_source_step_index = source_step_index; + source_step_indexes.insert(cur_source_step_index); } } - for (size_t i = 0; i < steps.size(); i++) { - // iterating over different source steps, although normally - // there will be just one. 
- int32 source_step = steps[i].first, - row_begin = steps[i].second.first, - row_end = steps[i].second.second; - // 'source' is just the elements of the source step that we're consuming. - std::vector source((*steps_)[source_step].begin() + row_begin, - (*steps_)[source_step].begin() + row_end); + std::unordered_set::const_iterator + source_step_iter = source_step_indexes.begin(), + source_step_end = source_step_indexes.end(); + // iterating over the indexes of the source steps. + for (; source_step_iter != source_step_end; ++source_step_iter) { + int32 source_step_index = *source_step_iter; + std::pair p(source_step_index, dim_range_node); + if (dim_range_nodes_.count(p) > 0) { + // We don't need to do anything; a dim-range node already exists for this + // step and this node index. + continue; + } + dim_range_nodes_.insert(p); + const std::vector &source_step = (*steps_)[source_step_index]; + // 'cindexes' will be the cindexes of the new step that we're going to add. std::vector cindexes; - ConvertToCindexes(source, &cindexes); + ConvertToCindexes(source_step, &cindexes); std::vector::iterator iter = cindexes.begin(), end = cindexes.end(); for (; iter != end; ++iter) iter->first = dim_range_node; bool add_if_absent = true; // this add_if_absent says, even if cindexes were not in the graph, - // add them. This is possible in principle; it's to satisfy the - // requirement that DimRangeNodes be implemented as contiguous ranges - // of rows of component nodes or input nodes. + // add them. This is possible; the step will contain all cindexes for the + // input step, even if they won't be needed. (This is costless; it's just + // setting up a sub-matrix). AddStep(cindexes, add_if_absent); } } diff --git a/src/nnet3/nnet-computation-graph.h b/src/nnet3/nnet-computation-graph.h index 7999f2208ad..c0662756502 100644 --- a/src/nnet3/nnet-computation-graph.h +++ b/src/nnet3/nnet-computation-graph.h @@ -439,7 +439,7 @@ class ComputationStepsComputer { /// (step-index, index-into-step), so that for any cindex_id c, /// (*steps)[locations[c].first][locations[c].second] == c. /// It's possible in principle if there are non-simple - /// Components, that for node corresponding to component-input + /// Components, that for nodes corresponding to component-input /// descriptors, a cindex might be present in more than one step, /// so it doesn't follow that if (*steps)[i][j] == c, then /// locations[c] == (i,j). @@ -547,6 +547,15 @@ class ComputationStepsComputer { /// (*steps_)[i][j] == c. This is also an output (we get the pointer in /// the constructor). std::vector > *locations_; + + + /// dim_range_nodes_ is used when allocating steps for nodes of type kDimRangeNode. + /// This is a set of (source_step, dim_range_node_index), + /// where source_step is the step in which we computed of the input + /// of the dim-range node (this step will be for a node of type kComponentNode). + /// This just tells us whether we've already added a particular dim-range node + /// for this step, so we know whether we need to add it again. + std::unordered_set, PairHasher > dim_range_nodes_; }; diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index d056a71498c..aefcb94c465 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -395,12 +395,17 @@ struct NnetComputation { // These are owned here. std::vector component_precomputed_indexes; - // used in kAddRows, kAddToRows, kCopyRows, kCopyToRows. contains row-indexes. 
+ // Used in commands kAddRows, kAddToRows, kCopyRows, which + // contain indexes into this data-member. + // Each vector<int32> is a vector of row-indexes (with -1 usually treated as + // a special case meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands). std::vector<std::vector<int32> > indexes; - // used kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti. - // contains pairs (sub-matrix index, row index)- or (-1,-1) meaning don't - // do anything for this row. + // Used in commands kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti and + // kCopyToRowsMulti. Contains pairs (sub-matrix index, row index), or the + // special pair (-1,-1) meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands. std::vector<std::vector<std::pair<int32, int32> > > indexes_multi; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index c53fba815fb..756ea45e894 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2576,6 +2576,328 @@ bool SnipRowOps(NnetComputation *computation) { +// This class implements the internals of the function SplitRowOps() which is +// declared in nnet-optimize-utils.h. +class RowOpsSplitter { + public: + RowOpsSplitter(NnetComputation *computation): computation_(computation) { } + + // Attempts to perform the optimization. Returns true if it made any change + // to the computation. + bool Split() { + return SplitIndexes() && SplitCommands(); + } + + private: + + // This function sets up split_info_, which describes how we can split up + // the vectors that are elements of computation_->indexes_multi. + // It will return true if it successfully split at least one of those + // vectors, and false otherwise. + bool SplitIndexes(); + + // This function modifies the commands in the computation. It returns + // true if it made any change. + bool SplitCommands(); + + + // This function attempts to optimize the command in + // computation_->commands[command_index]. It returns true if it made any + // change. If we are going to have to insert an extra command into the + // computation, this function will append an element to new_commands_. + bool SplitCommand(int32 command_index); + + // Below, define a multi-index as an element of NnetComputation::indexes_multi, + // for example, + // const std::vector<std::pair<int32, int32> > &multi_index = computation_->indexes_multi[1]; + // It is a list of pairs. + + // This struct appears as an element of the list inside MultiIndexSplitInfo. + // It helps us describe how we can split up a multi-index (a list of pairs) + // into a sequence of ranges where the .first value is constant across the + // range. + struct SingleSplitInfo { + // 'offset' is the index into the vector of pairs that forms the + // start of this range. In the example where we are splitting up + // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7)) + // there would be two instances of struct SingleSplitInfo, with + // offset = 0 and offset = 3. + int32 offset; + // 'size' is the number of pairs in this range; in the example + // above, both 'size' elements would be 3. + int32 size; + // first_value is the value of the .first index throughout this range; in + // the example above, it would be 10 and 15 respectively. It represents a + // submatrix index. + int32 first_value; + + // min_second_value is the minimum value of .second for any element in + // this range: it would be 2 and 3 respectively in the example above.
+ int32 min_second_value; + + // second_value_range is the highest value of .second for any element in + // this range, plus one, minus min_second_value. (It's the number of rows + // in the other submatrix of the operation). + int32 second_value_range; + + // If the .second values in the range are consecutive then + // 'second_value_offsets' will be empty. Otherwise it will + // be a vector of size 'size', containing numbers in the + // range 0 ... second_value_range - 1, such that + // min_second_value + second_value_offsets[i] gives + // the .second value at the corresponding position in the range. + // In the second range of the example above, the range + // consisting of ((15,3), (15,5), (15,7)), 'second_value_offsets + // would be the vector (0, 2, 4). + std::vector second_value_offsets; + }; + + // An instance of the struct MultiIndexSplitInfo will be created for each multi-index, + // i.e. for each element of NnetComputation::indexes_multi. + struct MultiIndexSplitInfo { + // If we can split this multi-index into at most two ranges, this + // vector will be nonempty; otherwise it will be empty. + std::vector splits; + }; + + // GetSplitInfo() attempts to take a range of a + // std::vector >, as represented by begin and end + // iterators, and to extract its information into an object of type + // SingleSplitInfo. (all except for the .offset member, which will have + // been set by calling code). + // It return true if successful, and false otherwise. The only reasons that + // it might return false are that the range contains -1's or does not contain + // all-identical .first members). + bool GetSplitInfo(std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info); + + // computation_ is the computation that we are modifying. + NnetComputation *computation_; + // split_info_ will contain information about how we can split up the members + // of computation_->indexes_multi into ranges. + std::vector split_info_; + // The following is a list of additional commands that we are going to insert + // into computation_, of the form (command-index, command) where command-index + // is a command index just before which we will insert the new command. + // (this is the format accepted by the function InsertCommands()). + std::vector > new_commands_; + +}; + + +bool RowOpsSplitter::GetSplitInfo( + std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info) { + // max_size_ratio must be > 1.0, and could in principle be a float. It is + // there to prevent us from making changes to the computation which would end + // up wastefully launching too many kernels that would do nothing. 
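+ // As a worked illustration of the struct above (using its own example): for the
+ // range ((15,3), (15,5), (15,7)) we would get size = 3, first_value = 15,
+ // min_second_value = 3 and second_value_range = 7 + 1 - 3 = 5, which passes the
+ // check below since 5 <= 3 * 2; its second_value_offsets would be (0, 2, 4).
+ // For ((10,2), (10,3), (10,4)) the .second values are consecutive, so
+ // second_value_offsets would be left empty.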
+ const int32 max_size_ratio = 2; + + int32 size = end - begin; + KALDI_ASSERT(size != 0); + int32 first = begin->first; + if (first < 0) + return false; + info->size = size; + info->first_value = first; + int32 initial_second_value = begin->second, + min_second_value = initial_second_value, + max_second_value = initial_second_value; + info->second_value_offsets.resize(size); + bool is_consecutive = true; + for (int32 i = 0; i < size; i++) { + int32 second = begin[i].second; + if (begin[i].first != first || second < 0) return false; + info->second_value_offsets[i] = second; + if (second != initial_second_value + i) + is_consecutive = false; + if (second < min_second_value) min_second_value = second; + if (second > max_second_value) max_second_value = second; + } + info->min_second_value = min_second_value; + info->second_value_range = max_second_value + 1 - min_second_value; + if (info->second_value_range > size * max_size_ratio) + return false; + if (is_consecutive) { + info->second_value_offsets.clear(); + } else { + for (int32 i = 0; i < size; i++) + info->second_value_offsets[i] -= min_second_value; + } + return true; +} + + +bool RowOpsSplitter::SplitIndexes() { + bool ans = false; + int32 num_indexes_multi = computation_->indexes_multi.size(); + split_info_.resize(num_indexes_multi); + for (int32 i = 0; i < num_indexes_multi; i++) { + const std::vector > &multi_index = + computation_->indexes_multi[i]; + MultiIndexSplitInfo &split_info = split_info_[i]; + + int32 num_pairs = multi_index.size(); + KALDI_ASSERT(num_pairs > 0); + // 'split_point' will be set to the first index j for which + // multi_index[j-1].first != multi_index[j].first, or -1 + // if no such j exists. + int32 split_point = -1, initial_first = multi_index[0].first; + for (int32 j = 1; j < num_pairs; j++) { + if (multi_index[j].first != initial_first) { + split_point = j; + break; + } + } + if (split_point == -1) { + split_info.splits.resize(1); + split_info.splits[0].offset = 0; + if (!GetSplitInfo(multi_index.begin(), multi_index.end(), + &(split_info.splits[0]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } else { + split_info.splits.resize(2); + split_info.splits[0].offset = 0; + split_info.splits[1].offset = split_point; + + std::vector >::const_iterator mid_iter = + multi_index.begin() + split_point; + if (!GetSplitInfo(multi_index.begin(), mid_iter, + &(split_info.splits[0])) || + !GetSplitInfo(mid_iter, multi_index.end(), + &(split_info.splits[1]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } + } + return ans; +} + +bool RowOpsSplitter::SplitCommand(int32 c) { + NnetComputation::Command &command = computation_->commands[c]; + CommandType command_type = command.command_type; + // For commands that are not of the following four types, return false: we + // won't be changing these commands. + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: + case kAddToRowsMulti: case kCopyToRowsMulti: break; + default: return false; + } + int32 indexes_multi_index = command.arg2; + KALDI_ASSERT(indexes_multi_index < + static_cast(split_info_.size())); + const MultiIndexSplitInfo &split_info = split_info_[indexes_multi_index]; + if (split_info.splits.empty()) + return false; // these indexes couldn't be split: e.g. they contained more + // than two distinct .first elements, or there were other + // reasons. + + // we'll be splitting the command into either one or two pieces. 
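+ // As a sketch of the intended effect (reusing the example multi-index from the
+ // SingleSplitInfo comments): a kAddRowsMulti command whose multi-index is
+ // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7)) would become a kMatrixAdd
+ // reading the 3 consecutive rows starting at row 2 of submatrix 10, plus a
+ // kAddRows reading from a 5-row sub-matrix starting at row 3 of submatrix 15
+ // with the row map (0, 2, 4).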
+ std::vector<NnetComputation::Command> split_commands( + split_info.splits.size()); + for (size_t i = 0; i < split_info.splits.size(); i++) { + const SingleSplitInfo &split = split_info.splits[i]; + NnetComputation::Command &command_out = split_commands[i]; + command_out.alpha = command.alpha; + command_out.arg1 = computation_->NewSubMatrix( + command.arg1, split.offset, split.size, 0, -1); + command_out.arg2 = computation_->NewSubMatrix( + split.first_value, split.min_second_value, + split.second_value_range, 0, -1); + + if (split.second_value_offsets.empty()) { + // The .second elements are consecutive. + switch (command_type) { + case kAddRowsMulti: + command_out.command_type = kMatrixAdd; + break; + case kCopyRowsMulti: + command_out.command_type = kMatrixCopy; + break; + case kAddToRowsMulti: + command_out.command_type = kMatrixAdd; + std::swap(command_out.arg1, command_out.arg2); + break; + case kCopyToRowsMulti: + command_out.command_type = kMatrixCopy; + std::swap(command_out.arg1, command_out.arg2); + break; + default: // will never be reached. + break; + } + } else { + // Indexes are not consecutive: it needs to be a kAddRows or kCopyRows + // command. + command_out.arg3 = computation_->indexes.size(); + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: { + command_out.command_type = (command_type == kAddRowsMulti ? + kAddRows : kCopyRows); + computation_->indexes.push_back(split.second_value_offsets); + break; + } + case kCopyToRowsMulti: { + // We can't operate on this command because of what would happen + // with values of 'indexes' (see the variable in the block for + // kAddToRowsMulti) which were -1. Rows of the output would be + // set to zero, which is not the behavior we want here; we'd want + // them to be unaffected. + return false; + } + case kAddToRowsMulti: { + command_out.command_type = kAddRows; + std::swap(command_out.arg1, command_out.arg2); + // invert the indexes. + std::vector<int32> indexes(split.second_value_range, -1); + for (int32 i = 0; i < split.size; i++) { + // the following assert should always succeed because the + // AddToRowsMulti and CopyToRowsMulti should never have + // duplicate destinations in their indexes. + KALDI_ASSERT(indexes[split.second_value_offsets[i]] == -1); + indexes[split.second_value_offsets[i]] = i; + } + computation_->indexes.push_back(indexes); + break; + } + default: + KALDI_ERR << "Code error: un-handled case."; + } + } + } + command = split_commands[0]; + // note: for now, split_commands.size() will be 1 or 2. + for (size_t i = 1; i < split_commands.size(); i++) { + new_commands_.resize(new_commands_.size() + 1); + // we'll want to insert this command right after command c, + // which is the same as just before command c + 1. + new_commands_.back().first = c + 1; + new_commands_.back().second = split_commands[i]; + } + return true; // We made a change.
+} + +bool RowOpsSplitter::SplitCommands() { + bool ans = false; + int32 num_commands = computation_->commands.size(); + for (int32 c = 0; c < num_commands; c++) + if (SplitCommand(c)) + ans = true; + if (!new_commands_.empty()) + InsertCommands(&new_commands_, computation_); + return ans; +} + +bool SplitRowOps(NnetComputation *computation) { + RowOpsSplitter splitter(computation); + return splitter.Split(); +} /* diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 703f43af095..32adf9e3e19 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -455,6 +455,23 @@ bool ReplaceRowWithMatrixOps(NnetComputation *computation); /// computation->indexes. bool SnipRowOps(NnetComputation *computation); + +/// This function detects cases where commands of type kAddRowsMulti, +/// kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that +/// correspond to at most two submatrices, in two distinct ranges without gaps +/// filled by -1's, and could be converted to at most two commands of type +/// kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows. (Note: it's important that +/// this optimization takes place after SnipRowOps, because it doesn't remove +/// the -1's from the edges of the indexes, it relies on that operation doing +/// so). The "without-gaps" stipulation is just for convenience of +/// implementation, to have fewer cases to worry about. +/// +/// This function returns true if it made any changes to the computation; if it +/// returns true, then after calling this you should at some point do +/// RenumberComputation(), which will remove any now-unused members of +/// computation->indexes. +bool SplitRowOps(NnetComputation *computation); + /// This function detects submatrices and matrices that are never used (e.g. due /// to changes made in other optimization code), and members of indexes, /// indexes_multi and indexes_ranges that are unused or are duplicates, and memo @@ -535,18 +552,18 @@ void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); /// Inserts commands into the computation at the requested places. 'commands' -/// is a list of pairs (command-index, command) that is expected to be sorted -/// on command-index. For each entry (c, command) in 'commands', 'command' is -/// inserted into 'computation' just *before* the command that (at entry) is in -/// computation->commands[c]. If there are multiple pairs with the same index -/// c, they will remain in the same order in which they were present in -/// 'commands'; however, 'commands' does not have to be sorted on 'c'. -/// As a special case, if c == computation->commands.size(), the -/// corresponding commands are inserted at the beginning of the computation. -/// This function will appropriately renumber the argument of the kGotoLabel -/// command of any 'looped' computation. Command indexes c in commands[*].first -/// must be in the range [0, computation->commands.size()]. -/// This function may modify 'commands' by sorting it. +/// is a list of pairs (command-index, command) that is expected to be sorted on +/// command-index. For each entry (c, command) in 'commands', 'command' is +/// inserted into 'computation' just *before* the command that (at entry) is in +/// computation->commands[c]. If there are multiple pairs with the same index +/// c, they will remain in the same order in which they were present in +/// 'commands'; however, 'commands' does not have to be sorted on 'c'. 
As a +/// special case, if c == computation->commands.size(), the corresponding +/// commands are inserted at the beginning of the computation. This function +/// will appropriately renumber the argument of the kGotoLabel command of any +/// 'looped' computation. Command indexes c in commands[*].first must be in the +/// range [0, computation->commands.size()]. This function may modify +/// 'commands' by sorting it. void InsertCommands( std::vector > *commands, NnetComputation *computation); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index d614afce7d0..ecce196801b 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -41,6 +41,14 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { if (tok == "") { ReadBasicType(is, binary, &optimize_row_ops); ReadToken(is, binary, &tok); + } else { + optimize_row_ops = true; + } + if (tok == "") { + ReadBasicType(is, binary, &split_row_ops); + ReadToken(is, binary, &tok); + } else { + split_row_ops = true; } KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &convert_addition); @@ -516,12 +524,16 @@ void Optimize(const NnetOptimizeOptions &config, } - if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops || + config.split_row_ops)) { bool must_renumber = false; if (config.snip_row_ops && SnipRowOps(computation)) must_renumber = true; + if (config.split_row_ops && SplitRowOps(computation)) + must_renumber = true; if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) must_renumber = true; + if (must_renumber) { RenumberComputation(computation); if (GetVerboseLevel() >= 3) diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 31872e46b72..a07c5490c5c 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -39,6 +39,7 @@ struct NnetOptimizeOptions { bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool split_row_ops; bool extend_matrices; bool convert_addition; bool remove_assignments; @@ -63,6 +64,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + split_row_ops(true), extend_matrices(true), convert_addition(true), remove_assignments(true), @@ -95,6 +97,10 @@ struct NnetOptimizeOptions { opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); + opts->Register("split-row-ops", &split_row_ops, "Set to false to disable " + "an optimization that may replace some operations of type " + "kCopyRowsMulti or kAddRowsMulti with up to two simpler " + "operations."); opts->Register("convert-addition", &convert_addition, "Set to false to " "disable the optimization that converts Add commands into " "Copy commands wherever possible."); diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 48a97df9ea1..e68321b3260 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -679,26 +679,36 @@ void GenerateConfigSequenceLstmWithTruncation( } std::string spliced_input = temp_string_stream.str(); - std::string c_tminus1 = "IfDefined(Offset(c_t, -1))"; + int32 offset = RandInt(-3, 3); + if (offset == 0) + offset = -1; + + + std::string c_tminus1; + { + std::ostringstream os_temp; + os_temp << "IfDefined(Offset(c_t, " << offset << "))"; + c_tminus1 = os_temp.str(); + } os << "component-node name=c_t component=c input=Sum(c1_t, c2_t)\n"; // i_t os << 
"component-node name=i1 component=Wi-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=i2 component=Wic " << " input=" << c_tminus1 << std::endl; os << "component-node name=i_t component=i input=Sum(i1, i2)\n"; // f_t os << "component-node name=f1 component=Wf-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=f2 component=Wfc " << " input=" << c_tminus1 << std::endl; os << "component-node name=f_t component=f input=Sum(f1, f2)\n"; // o_t os << "component-node name=o1 component=Wo-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=o2 component=Woc input=Sum(c1_t, c2_t)\n"; os << "component-node name=o_t component=o input=Sum(o1, o2)\n"; @@ -707,7 +717,7 @@ void GenerateConfigSequenceLstmWithTruncation( // g_t os << "component-node name=g1 component=Wc-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=g_t component=g input=g1\n"; // parts of c_t @@ -758,6 +768,10 @@ void GenerateConfigSequenceLstmType2( cell_dim = 40 + Rand() % 50, projection_dim = std::ceil(cell_dim / (Rand() % 10 + 2)); + int32 offset = RandInt(-3, 3); + if (offset == 0) + offset = -1; + os << "input-node name=input dim=" << input_dim << std::endl; // Parameter Definitions W*(* replaced by - to have valid names) os << "component name=W-x type=NaturalGradientAffineComponent input-dim=" @@ -819,10 +833,13 @@ void GenerateConfigSequenceLstmType2( } os << ")\n"; - os << "component-node name=W-r component=W-r input=IfDefined(Offset(r_t, -1))\n"; + os << "component-node name=W-r component=W-r input=IfDefined(Offset(r_t" + << offset << "))\n"; os << "component-node name=W-m component=W-m input=m_t \n"; - os << "component-node name=Wic component=Wic input=IfDefined(Offset(c_t, -1))\n"; - os << "component-node name=Wfc component=Wfc input=IfDefined(Offset(c_t, -1))\n"; + os << "component-node name=Wic component=Wic input=IfDefined(Offset(c_t" + << offset << "))\n"; + os << "component-node name=Wfc component=Wfc input=IfDefined(Offset(c_t" + << offset << "))\n"; os << "component-node name=Woc component=Woc input=c_t\n"; // Splitting the outputs of W*m node @@ -857,7 +874,8 @@ void GenerateConfigSequenceLstmType2( os << "component-node name=i_t component=i_t input=Sum(W_ix-x_t, Sum(W_ir-r_tminus1, Wic))\n"; os << "component-node name=f_t component=f_t input=Sum(W_fx-x_t, Sum(W_fr-r_tminus1, Wfc))\n"; os << "component-node name=o_t component=o_t input=Sum(W_ox-x_t, Sum(W_or-r_tminus1, Woc))\n"; - os << "component-node name=f_t-c_tminus1 component=f_t-c_tminus1 input=Append(f_t, Offset(c_t, -1))\n"; + os << "component-node name=f_t-c_tminus1 component=f_t-c_tminus1 input=Append(f_t, Offset(c_t" + << offset << "))\n"; os << "component-node name=i_t-g component=i_t-g input=Append(i_t, g)\n"; os << "component-node name=m_t component=m_t input=Append(o_t, h)\n"; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 2874d6fd14e..fa6ade55864 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1867,19 +1867,7 @@ class ModelCollapser { void CollapseModel(const CollapseModelConfig &config, Nnet *nnet) { ModelCollapser c(config, 
nnet); - std::string info_before_collapse; - if (GetVerboseLevel() >= 4) - info_before_collapse = nnet->Info(); c.Collapse(); - if (GetVerboseLevel() >= 4) { - std::string info_after_collapse = nnet->Info(); - if (info_after_collapse != info_before_collapse) { - KALDI_VLOG(4) << "Collapsing model: info before collapse was: " - << info_before_collapse - << ", info after collapse was:" - << info_after_collapse; - } - } } bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index d6b4b1ded5d..3cd56ef1c74 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) { "If --apply-exp=true, apply the Exp() function to the output " "before writing it out.\n" "\n" - "Usage: nnet3-compute [options] \n" + "Usage: nnet3-compute [options] \n" " e.g.: nnet3-compute final.raw scp:feats.scp ark:nnet_prediction.ark\n" "See also: nnet3-compute-from-egs\n"; @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) { NnetSimpleComputationOptions opts; opts.acoustic_scale = 1.0; // by default do no scaling in this recipe. - bool apply_exp = false; + bool apply_exp = false, use_priors = false; std::string use_gpu = "yes"; std::string word_syms_filename; @@ -74,6 +74,9 @@ int main(int argc, char *argv[]) { "output"); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("use-priors", &use_priors, "If true, subtract the logs of the " + "priors stored with the model (in this case, " + "a .mdl file is expected as input)."); po.Read(argc, argv); @@ -90,12 +93,26 @@ int main(int argc, char *argv[]) { feature_rspecifier = po.GetArg(2), matrix_wspecifier = po.GetArg(3); - Nnet nnet; - ReadKaldiObject(nnet_rxfilename, &nnet); + Nnet raw_nnet; + AmNnetSimple am_nnet; + if (use_priors) { + bool binary; + TransitionModel trans_model; + Input ki(nnet_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + } else { + ReadKaldiObject(nnet_rxfilename, &raw_nnet); + } + Nnet &nnet = (use_priors ? am_nnet.GetNnet() : raw_nnet); SetBatchnormTestMode(true, &nnet); SetDropoutTestMode(true, &nnet); CollapseModel(CollapseModelConfig(), &nnet); + Vector priors; + if (use_priors) + priors = am_nnet.Priors(); + RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); RandomAccessBaseFloatVectorReaderMapped ivector_reader( @@ -139,7 +156,6 @@ int main(int argc, char *argv[]) { } } - Vector priors; DecodableNnetSimple nnet_computer( opts, nnet, priors, features, &compiler, diff --git a/tools/extras/install_beamformit.sh b/tools/extras/install_beamformit.sh index db767682467..e61b6645c36 100755 --- a/tools/extras/install_beamformit.sh +++ b/tools/extras/install_beamformit.sh @@ -5,7 +5,7 @@ # libsndfile needed by beamformit [ ! -f libsndfile-1.0.25.tar.gz ] && \ wget http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.25.tar.gz -[ ! -d liblbfgs-1.10] && \ +[ ! -d libsndfile-1.0.25 ] && \ tar xzf libsndfile-1.0.25.tar.gz ( cd libsndfile-1.0.25