diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.sh b/egs/fisher_english/s5/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..2f724c8ff81 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev " "# WER on test ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} test${epoch_infix}) + wer=$(grep WER $dirname/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} test${epoch_infix}) + wer=$(grep WER $dirname/decode_looped_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
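+  # (The probability sections below are only printed for regular chain
+  # systems; they read the 'Overall' objective values from
+  # compute_prob_train.final.log and compute_prob_valid.final.log, and the
+  # rows marked (xent) come from the cross-entropy branch of the model.)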
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done + +echo diff --git a/egs/fisher_english/s5/local/chain/run_semisupervised.sh b/egs/fisher_english/s5/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..7b9c56e063e --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_semisupervised.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! 
cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + nnet3-chain-copy-egs "ark:cat $sup_egs_dir/combine.cegs $unsup_egs_dir/combine.cegs |" ark:$comb_egs_dir/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh new file mode 100755 index 00000000000..7352152a3bc --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -0,0 +1,202 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train +tree_affix= +nnet3_affix= +xent_regularize=0.1 +hidden_dim=725 +num_leaves=11000 # number of PdfIds in chain modeling + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 +num_jobs_initial=3 +num_jobs_final=16 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +frames_per_iter=1500000 + +gmm=tri5a +build_tree_ali_dir=exp/tri4a_ali # used to make a new tree for chain topology, should match train data +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
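+  # The tree is built on top of the 'chain' topology generated in the
+  # previous stage; --frame-subsampling-factor 3 matches the reduced
+  # (one-third) output frame rate used by chain models.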
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" $num_leaves $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
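+  # Note: if --common-egs-dir is supplied (as the semi-supervised recipe does
+  # with its combined supervised+unsupervised egs), train.py uses that
+  # directory directly instead of dumping new egs from $train_data_dir.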
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..6505381b03f --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +generate_alignments=true # false if doing chain training +speed_perturb=true +train_set=train + +lda_train_set=train_100k +nnet3_affix= +gmm=tri2_ali # should also contain alignments for $lda_train_set + +. ./path.sh +. ./utils/parse_options.sh + +gmm_dir=exp/$gmm + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. 
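+    # Two speed-perturbed copies (0.9x and 1.1x) are created below and then
+    # combined with an sp1.0- prefixed copy of the original data, roughly
+    # tripling the amount of training data.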
+ # _sp stands for speed-perturbed + + for datadir in ${train_set}; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang exp/tri5a exp/tri5a_ali_${train_set}_sp || exit 1 + fi + train_set=${train_set}_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k directory is copied seperately, as + # we want to use exp/tri2_ali for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set $lda_train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). 
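+    # fix_data_dir.sh filters utt2spk, feats.scp etc. down to the utterances
+    # that actually have features, keeping the data directory consistent.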
+ utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${lda_train_set}_hires \ + data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a +fi + +if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/nnet3/run_tdnn.sh b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh new file mode 100644 index 00000000000..f055b853b61 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# This script is not tested. + +# this is the standard "tdnn" system, built in nnet3; it's what we used to +# call multi-splice. + +. ./cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. 
+# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < data/$supervised_set/supervised_uttlist || true + utils/shuffle_list.pl data/$base_train_set/feats.scp | cut -d' ' -f1 | \ + tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true + utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist \ + data/$base_train_set data/$supervised_set || exit 1 + utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist \ + data/$base_train_set data/$unsupervised_set || exit 1 + utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp \ + data/${base_train_set}_sp_hires data/${unsupervised_set}_hires +fi + +if [ $stage -le -3 ]; then + # align the supervised subset with the current cleaned gmm + if [ -f $gmm/ali.1.gz ]; then + echo "$0: alignments in $gmm appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning the supervised data data/${supervised_set}" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${supervised_set} data/lang exp/$base_gmm exp/$gmm +fi + +if [ $stage -le -2 ]; then + echo "$0: chain training on the supervised subset data/${supervised_set}" + local/chain/run_tdnn.sh $train_supervised_opts --remove-egs false \ + --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix +fi + +if [ $stage -le -1 ]; then + echo "$0: getting ivectors for the hires unsupervised data data/${unsupervised_set}_hires" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ + data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires +fi + +chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +if [ $stage -le 0 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}${decode_affix} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${unsupervised_set}_hires \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}${decode_affix}_rescore/final.mdl || true +fi + +if [ $stage -le 1 ]; then + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ + 
--left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --egs-weight $unsup_egs_weight \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore $chaindir/unsup_egs${decode_affix}${egs_affix} +fi + +sup_egs_dir=$chaindir/egs +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} +comb_egs_dir=$chaindir/comb_egs${decode_affix}${egs_affix} +if [ $stage -le 2 ]; then + echo "$0: combining supervised/unsupervised egs" + num_archives=`cat $chaindir/egs/info/num_archives` + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi + diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 99921a9bf61..1c4a032fc57 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -52,6 +52,7 @@ train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. @@ -59,6 +60,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -212,13 +214,13 @@ if [ $stage -le 18 ]; then --egs.chunk-width 150 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index c1d45e41de7..076dc95b2d7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -64,6 +64,14 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +phone_insertion_penalty= echo "$0 $@" # Print the command line for logging @@ -288,8 +296,10 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" + [ ! -z $lattice_prune_beam ] && \ + prune_cmd="ark:- | lattice-prune --acoustic-scale=0.1 --beam=$lattice_prune_beam ark:-" $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp fi @@ -307,6 +317,12 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +[ ! -z $lattice_lm_scale ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + +[ ! 
-z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial @@ -385,6 +401,7 @@ if [ $stage -le 4 ]; then utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ + --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index b5597b15667..35489ca5e22 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -19,6 +19,7 @@ #include "chain/chain-supervision.h" #include "lat/lattice-functions.h" +#include "lat/push-lattice.h" #include "util/text-utils.h" #include "hmm/hmm-utils.h" #include @@ -142,9 +143,9 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; @@ -176,9 +177,10 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return false; } proto_supervision->fst.AddArc(state, - fst::StdArc(phone, phone, - fst::TropicalWeight::One(), - lat_arc.nextstate)); + fst::StdArc(phone, phone, + fst::TropicalWeight(lat_arc.weight.Weight().Value1() + * opts.lm_scale + opts.phone_ins_penalty), + lat_arc.nextstate)); int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, (next_state_time + opts.right_tolerance)), @@ -189,7 +191,8 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, proto_supervision->allowed_phones[t_subsampled].push_back(phone); } if (lat.Final(state) != CompactLatticeWeight::Zero()) { - proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + proto_supervision->fst.SetFinal(state, fst::TropicalWeight( + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames @@ -207,6 +210,16 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return true; } +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + return true; +} bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will do the range-check on 'ilabel'. 
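To summarize the weighting introduced above: each arc of the proto-supervision FST now carries the cost

    lm_scale * graph_cost + phone_ins_penalty

where graph_cost is the graph/LM component (Value1) of the CompactLattice arc weight, and final states are assigned lm_scale * graph_cost. With the defaults (lm_scale=0.0, phone_ins_penalty=0.0) this reduces to the previous unweighted supervision; when lm_scale is nonzero, the new wrapper additionally pushes the weights to the initial state and removes the total weight via fst::Push.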
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index a94f68ade90..ce755f0cb63 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,10 +50,16 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat weight; + BaseFloat lm_scale; + BaseFloat phone_ins_penalty; SupervisionOptions(): left_tolerance(5), right_tolerance(5), - frame_subsampling_factor(1) { } + frame_subsampling_factor(1), + weight(1.0), + lm_scale(0.0), + phone_ins_penalty(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -65,6 +71,13 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("weight", &weight, + "Use this to set the supervision weight for training"); + opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " + "weights from the phone lattice are included in the " + "supervision fst."); + opts->Register("phone-ins-penalty", &phone_ins_penalty, + "The penalty to penalize longer paths"); } void Check() const; };
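For orientation, a rough sketch of how the new options reach chain-get-supervision when get_egs.sh is called with --lattice-lm-scale and --egs-weight; the paths and values here are placeholders, and the actual pipeline is the one in stage 4 of get_egs.sh above:

    # hypothetical standalone invocation, mirroring the stage-4 pipeline of get_egs.sh
    lattice-align-phones --replace-output-symbols=true exp/chain/tri5a_lats/final.mdl \
        scp:exp/chain/tdnn/unsup_egs/lat.scp ark:- | \
      chain-get-supervision --lattice-input=true --frame-subsampling-factor=3 \
        --left-tolerance=2 --right-tolerance=2 \
        --lm-scale=0.5 --phone-ins-penalty=0.0 --weight=0.3 \
        exp/chain/tdnn/tree exp/chain/tdnn/0.trans_mdl ark:- ark:sup.ark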