diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 4a39dfb66ac..2f050be93f2 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7n.sh \ No newline at end of file +tuning/run_tdnn_7o.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh new file mode 100755 index 00000000000..753dfc632ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -0,0 +1,297 @@ +#!/bin/bash + + +# 7o is as 7n but with a bunch of tuning changes affecting both the structure +# and the learning rates/l2 regularization. Structurally the main change is +# that we also do splicing via an extra layer whose input and output are in the +# "small" dim (256); this increases the left and right context. We also change +# the orthonormal-constraint to be "floating" meaning it doesn't constrain the +# size of the matrix (the value orthonormal-constraint=-1 is interpreted +# specially by the code), which means we can control how fast these constrained +# layers learn via l2, just like the unconstrained layers. Also the l2 +# values were increased and the learning rates were decreased; there are +# more epochs (6->8); and the dimension of some of the layers (the ones that +# are subsampled and which don't receive skip-splicing) was increased from +# 1280 to 1536. The config is a bit messy and I'd like to find a way to +# encapsulate things a bit better; treat this as a work in progress. 
+# +# +# +# local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp +# System tdnn7n_sp tdnn7m26j_sp +# WER on train_dev(tg) 12.18 11.74 +# WER on train_dev(fg) 11.12 10.69 +# WER on eval2000(tg) 14.9 14.6 +# WER on eval2000(fg) 13.5 13.1 +# WER on rt03(tg) 18.4 17.5 +# WER on rt03(fg) 16.2 15.4 +# Final train prob -0.077 -0.070 +# Final valid prob -0.093 -0.084 +# Final train prob (xent) -0.994 -0.883 +# Final valid prob (xent) -1.0194 -0.9110 +# Num-parameters 20111396 22865188 + + +# exp/chain/tdnn7o_sp: num-iters=525 nj=3..16 num-params=22.9M dim=40+100->6034 combine=-0.074->-0.073 (over 7) xent:train/valid[348,524,final]=(-1.20,-0.884,-0.883/-1.24,-0.918,-0.911) logprob:train/valid[348,524,final]=(-0.100,-0.071,-0.070/-0.115,-0.086,-0.084) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7o +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1280 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts 
input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1280 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py 
--xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + +# --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + + + steps/nnet3/chain/train.py --stage $train_stage \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 6fbde1fbbcc..99911b39fb2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,6 +69,7 @@ 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, 'renorm-component': xlayers.XconfigRenormComponent, + 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent } diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 63f6278d1ca..f7da8956d1c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -16,7 +16,7 @@ class XconfigRenormComponent(XconfigLayerBase): """This class is for parsing lines like - 'renorm-component name=renorm input=Append(-3,0,3)' + 'renorm-component name=renorm1 input=Append(-3,0,3)' which will produce just a single component, of type NormalizeComponent. 
Parameters of the class, and their defaults: @@ -70,9 +70,65 @@ def _generate_config(self): return configs +class XconfigBatchnormComponent(XconfigLayerBase): + """This class is for parsing lines like + 'batchnorm-component name=batchnorm input=Append(-3,0,3)' + which will produce just a single component, of type BatchNormComponent. + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + target-rms=1.0 [The target RMS of the BatchNormComponent] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'target-rms': 1.0 } + + def check_configs(self): + assert self.config['target-rms'] > 0.0 + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + target_rms = self.config['target-rms'] + + configs = [] + line = ('component name={0} type=BatchNormComponent dim={1} target-rms={2}'.format( + self.name, input_dim, target_rms)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigNoOpComponent(XconfigLayerBase): """This class is for parsing lines like - 'no-op-component name=renorm input=Append(-3,0,3)' + 'no-op-component name=noop1 input=Append(-3,0,3)' which will produce just a single component, of type NoOpComponent. Parameters of the class, and their defaults: @@ -127,7 +183,7 @@ class XconfigLinearComponent(XconfigLayerBase): """This class is for parsing lines like 'linear-component name=linear1 dim=1024 input=Append(-3,0,3)' which will produce just a single component, of type LinearComponent, with - output-dim 1024 in this case, and input-dim determined by the dimention + output-dim 1024 in this case, and input-dim determined by the dimension of the input . Parameters of the class, and their defaults: @@ -137,7 +193,7 @@ class XconfigLinearComponent(XconfigLayerBase): The following (shown with their effective defaults) are just passed through to the component's config line. - orthonormal-constraint=-1 + orthonormal-constraint=0.0 max-change=0.75 l2-regularize=0.0