diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh new file mode 100755 index 00000000000..56c1d783a9e --- /dev/null +++ b/egs/tedlium/s5_r3/cmd.sh @@ -0,0 +1,15 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/tedlium/s5_r3/conf/mfcc.conf b/egs/tedlium/s5_r3/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/tedlium/s5_r3/conf/mfcc_hires.conf b/egs/tedlium/s5_r3/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/tedlium/s5_r3/conf/online_cmvn.conf b/egs/tedlium/s5_r3/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..88dde1ff0e2 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..40cdcb5b5ff --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# Results + +# System tdnn_1a +# Scoring script sclite +# WER on dev(orig) 8.2 +# WER on dev(rescored ngram) 7.6 +# WER on dev(rescored rnnlm) 6.3 +# WER on test(orig) 8.1 +# WER on test(rescored ngram) 7.7 +# WER on test(rescored rnnlm) 6.7 +# Final train prob -0.0802 +# Final valid prob -0.0980 +# Final train prob (xent) -1.1450 +# Final valid prob (xent) -1.2498 +# Num-params 26651840 + + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). 
+# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 self-repair-scale=1.0e-04 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=1024 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150,110,100 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is 
mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..f8eec8c5213 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# run_tdnn_1b.sh is the script which results are presented in the corpus release paper. +# It uses 2 to 6 jobs and add proportional-shrink 10. + +# WARNING +# This script is flawed and misses key elements to optimize the tdnnf setup. +# You can run it as is to reproduce results from the corpus release paper, +# but a more up-to-date version should be looked at in other egs until another +# setup is added here. 
+ +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b +# System tdnn_1a tdnn_1b tdnn_1b +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 +# Num-params 26651840 25782720 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1280 + linear-component name=tdnn2l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 + relu-batchnorm-layer name=tdnn3 dim=1280 + linear-component name=tdnn4l dim=256 input=Append(-1,0) + relu-batchnorm-layer 
name=tdnn4 input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 + relu-batchnorm-layer name=tdnn5 dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 + relu-batchnorm-layer name=prefinal-chain input=prefinal-l dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets + relu-batchnorm-layer name=prefinal-xent input=prefinal-l dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh new file mode 100755 index 00000000000..49de5b12372 --- /dev/null +++ b/egs/tedlium/s5_r3/local/download_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +mkdir -p db + +cd db ### Note: the rest of this script is executed from the directory 'db'. + +# TED-LIUM database: +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + if [ ! -e TEDLIUM_release-3 ]; then + ln -sf /export/corpora5/TEDLIUM_release-3 + fi + echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" +else + if [ ! -e TEDLIUM_release-3 ]; then + echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" + # the following command won't re-get it if it's already there + # because of the --continue switch. + wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1 + # NOTE(fix): the archive downloaded above is named TEDLIUM_release-3.tgz, + # not .tar.gz — un-tar the file that was actually fetched. + tar xf "TEDLIUM_release-3.tgz" + else + echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists." 
+ fi +fi + + +num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l) +if [ "$num_sph" != 2351 ]; then + echo "$0: expected to find 2351 .sph files in the directory db/TEDLIUM_release-3, found $num_sph" + exit 1 +fi + +exit 0 + diff --git a/egs/tedlium/s5_r3/local/format_lms.sh b/egs/tedlium/s5_r3/local/format_lms.sh new file mode 100755 index 00000000000..bba5bbd17ec --- /dev/null +++ b/egs/tedlium/s5_r3/local/format_lms.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + + +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +if [ -f data/lang_nosp/G.fst ] && [ data/lang_nosp/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp/G.fst as it already exists and " + echo ".. is newer than the source LM." +else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp +fi + + + +if [ -f data/lang_nosp_rescore/G.carpa ] && [ data/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already be up to date." 
+else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp data/lang_nosp_rescore || exit 1; +fi + +exit 0; diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..5322da6240f --- /dev/null +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +nj=30 + +train_set=train_cleaned # you might set this to e.g. train. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. 
You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l -> : map dev stm labels to be coherent with train + test, + # - -> : --||-- + # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary + # - -> null : remove marked , it is modelled implicitly (in kaldi) + # - (...) 
-> null : remove utterance names from end-lines of train + # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py) + { # Add STM header, so sclite can prepare the '.lur' file + echo ';; +;; LABEL "o" "Overall" "Overall results" +;; LABEL "f0" "f0" "Wideband channel" +;; LABEL "f2" "f2" "Telephone channel" +;; LABEL "male" "Male" "Male Talkers" +;; LABEL "female" "Female" "Female Talkers" +;;' + # Process the STMs + cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ + sed -e 's:([^ ]*)$::' | \ + awk '{ $2 = "A"; print $0; }' + } | local/join_suffix.py > data/$set.orig/stm + + # Prepare 'text' file + # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary + cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \ + awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100); + for (i=7;i<=NF;i++) { printf(" %s", $i); } + printf("\n"); + }' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1 + + # Prepare 'segments', 'utt2spk', 'spk2utt' + cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments + cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk + cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt + + # Prepare 'wav.scp', 'reco2file_and_channel' + cat $dir/spk2utt | awk -v set=$set -v pwd=$PWD '{ printf("%s sph2pipe -f wav -p %s/db/TEDLIUM_release-3/legacy/%s/sph/%s.sph |\n", $1, pwd, set, $1); }' > $dir/wav.scp + cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel + + # Create empty 'glm' file + echo ';; empty.glm + [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token + ' > data/$set.orig/glm + + # The training set seems to not have enough silence padding in the segmentations, + # especially at the beginning of segments. Extend the times. 
+ if [ $set == "train" ]; then + mv data/$set.orig/segments data/$set.orig/segments.temp + utils/data/extend_segment_times.py --start-padding=0.15 \ + --end-padding=0.1 data/$set.orig/segments || exit 1 + rm data/$set.orig/segments.temp + fi + + # Check that data dirs are okay! + utils/validate_data_dir.sh --no-feats $dir || exit 1 +done + diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh new file mode 100755 index 00000000000..204b3f910e5 --- /dev/null +++ b/egs/tedlium/s5_r3/local/prepare_dict.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Daniel Galvez +# 2016 Vincent Nguyen +# Apache 2.0 +# + +dir=data/local/dict_nosp +mkdir -p $dir + +srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic + +[ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 + +# Join dicts and fix some troubles +# NOTE(fix): the <s>, </s> and <unk> tokens had been stripped from the grep +# patterns (an empty pattern with -v would discard every lexicon entry); +# restored to filter the sentence-boundary and unknown-word markers. +cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt + +cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ + grep -v SIL | sort > $dir/nonsilence_phones.txt + +( echo SIL; echo NSN ) > $dir/silence_phones.txt + +echo SIL > $dir/optional_silence.txt + +# No "extra questions" in the input to this setup, as we don't +# have stress or tone. +echo -n >$dir/extra_questions.txt + +# Add to the lexicon the silences, noises etc. +# Typically, you would use "<UNK> NSN" here, but the Cantab Research language models +# use <unk> instead of <UNK> to represent out of vocabulary words. +echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt + +# Check that the dict dir is okay! 
+utils/validate_dict_dir.pl $dir || exit 1 diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh new file mode 100755 index 00000000000..61ad07645ff --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez (Ubiqus) +# +# This script takes a rnnlm_dir and averages its models. +# +# Takes the default rnnlm_dir of tedlium s5_r3 recipe, +# and average the best model and the 10 previous and +# following ones (if they exist). + + +. ./cmd.sh +. ./path.sh + +set -e -o pipefail -u + +rnnlm_dir=exp/rnnlm_lstm_tdnn_a +begin= +end= + +. utils/parse_options.sh # accept options + +# get the best iteration +best_iter=$(rnnlm/get_best_model.py $rnnlm_dir) + +# get num_iters +info=$(grep "num_iters" $rnnlm_dir/info.txt) +num_iters=${info##*=} + + +# test if begin and end exist +if [ -z $begin ] && [ -z $end ]; then + begin=$(($best_iter-10)) + end=$(($best_iter+10)) + if [ $begin -le 1 ]; then + begin=1 + fi + if [ ! 
$end -le $num_iters ]; then + end=$num_iters + fi +fi + +# create list of models and embeddings files to merge +models="" +embeddings="" +for num in $(seq -s' ' $begin $end); do + [ -f $rnnlm_dir/$num.raw ] && \ + models=$models" $rnnlm_dir/$num.raw" + [ -f $rnnlm_dir/feat_embedding.$num.mat ] && \ + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" +done + +# merge list of files +mkdir -p ${rnnlm_dir}_averaged +nnet3-average $models ${rnnlm_dir}_averaged/final.raw +matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat + +# copy other files to averaged rnnlm_dir +cp -r $rnnlm_dir/{info.txt,word_feats.txt,config,special_symbol_opts.txt} ${rnnlm_dir}_averaged + diff --git a/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh new file mode 100755 index 00000000000..ba6252450da --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# To be run from the egs/ directory. + +. path.sh + +set -e -o pipefail -u + +# it should contain things like +# foo.txt, bar.txt, and dev.txt (dev.txt is a special filename that's +# obligatory). +data_dir=data/rnnlm +dir=exp/rnnlm/ +mkdir -p $dir + +# validata data dir +rnnlm/validate_data_dir.py $data_dir/data/ + +# get unigram counts +rnnlm/get_unigram_counts.sh $data_dir/data/ + +# get vocab +mkdir -p $data_dir/vocab +rnnlm/get_vocab.py $data_dir/data > $data_dir/vocab/words.txt + +# Choose weighting and multiplicity of data. +# The following choices would mean that data-source 'foo' +# is repeated once per epoch and has a weight of 0.5 in the +# objective function when training, and data-source 'bar' is repeated twice +# per epoch and has a data -weight of 1.5. +# There is no contraint that the average of the data weights equal one. +# Note: if a data-source has zero multiplicity, it just means you are ignoring +# it; but you must include all data-sources. 
+#cat > exp/foo/data_weights.txt < $dir/data_weights.txt < $dir/unigram_probs.txt + +# choose features +rnnlm/choose_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt > $dir/features.txt +# validate features +rnnlm/validate_features.py $dir/features.txt + +# make features for word +rnnlm/make_word_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt $dir/features.txt \ + > $dir/word_feats.txt + +# validate word features +rnnlm/validate_word_features.py --features-file $dir/features.txt \ + $dir/word_feats.txt diff --git a/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh new file mode 120000 index 00000000000..72a3172db41 --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh @@ -0,0 +1 @@ +tuning/run_lstm_tdnn_a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh new file mode 100755 index 00000000000..32252db937d --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson +# 2017 Hainan Xu +# 2017 Ke Li +# 2018 François Hernandez (Ubiqus) +# +# rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. + +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(ngram) 7.6 7.2 +# WER on dev(rnnlm) 6.3 6.1 +# WER on test(orig) 8.1 8.0 +# WER on test(ngram) 7.7 7.5 +# WER on test(rnnlm) 6.7 6.6 + +# Begin configuration section. +dir=exp/rnnlm_lstm_tdnn_a +embedding_dim=800 +lstm_rpd=200 +lstm_nrpd=200 +stage=-10 +train_stage=-10 +epochs=20 + +. ./cmd.sh +. 
utils/parse_options.sh +[ -z "$cmd" ] && cmd=$train_cmd + +text_from_audio=data/train/text +text=data/LM/train.txt +wordlist=data/lang_chain/words.txt +dev_sents=10000 +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $text $wordlist; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/prepare_data.sh and utils/prepare_lang.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + # shuffle text from audio and lm + cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + shuf > data/rnnlm/full_lm_data.shuffled + # create dev and train sets based on audio and LM data + cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt + cat data/rnnlm/full_lm_data.shuffled | tail -n +$[$dev_sents+1] > $text_dir/ted.txt + +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features=10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; + +# Note: the double level of quoting for the sed command + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| \ + sed "'s:<UNK>::g'" \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +# Show results +for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done + +exit 0; diff --git a/egs/tedlium/s5_r3/local/score_sclite.sh b/egs/tedlium/s5_r3/local/score_sclite.sh new file mode 100755 index 00000000000..16c8b30e52f --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_sclite.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012, +# Brno University of Technology (Author: Karel Vesely) 2014, +# Apache 2.0 +# + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +beam=7 # speed-up, but may affect MBR confidences. +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring " + echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
+dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; +hubdir=`dirname $hubscr` + +for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ + $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +# name=`basename $data`; # e.g. eval2000 +nj=$(cat $dir/num_jobs) + +mkdir -p $dir/scoring/log + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +if [ $stage -le 0 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + set -e -o pipefail \; \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-align-words --output-error-lats=true --max-expand=10.0 --test=false \ + $lang/phones/word_boundary.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ + sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; + done +fi + +if [ $stage -le 1 ]; then + # Remove some stuff we don't want to score, from the ctm. + for x in $dir/score_*/ctm; do + # `-i` is not needed in the following. 
It is added for robustness in case this code is copy-pasted + # into another script that, e.g., uses <UNK> instead of <unk> + grep -v -w -i '<unk>' <$x > ${x}.filt || exit 1; + done +fi + +# Score the set... +if [ $stage -le 2 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ + cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/ctm.filt || exit 1; + done +fi + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh new file mode 100755 index 00000000000..ad833555b5f --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_lm.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# Copyright 2018 David Snyder +# Apache 2.0 +# +# This script downloads pre-built language models trained on the Cantab-Tedlium +# text data and Tedlium acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_lm.sh. + +set -e + +echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 +wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 + +exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh new file mode 100755 index 00000000000..431d44c6ff6 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez +# Apache 2.0 +# +# This script downloads pre-built RNN language models trained on the TED-LIUM +# text data and acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_rnnlm.sh. 
+ +set -e + +echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +cd exp/rnnlm_lstm_tdnn_a_averaged +tar -xvzf tedlium_rnnlm.tgz || exit 1 +rm tedlium_rnnlm.tgz +mkdir config +cd ../.. +cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt +echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh new file mode 100755 index 00000000000..3c587f63094 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# This script trains a LM on the Cantab-Tedlium text data and tedlium acoustic training data. +# It is based on the example scripts distributed with PocoLM + +# It will first check if pocolm is installed and if not will process with installation +# It will then get the source data from the pre-downloaded Cantab-Tedlium files +# and the pre-prepared data/train text source. + + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Unzip TEDLIUM 6 data sources, remove , gzip the result. + gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz + # use a subset of the annotated training data as the dev set . + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + # .. and the rest of the training data as an additional data source. + # we can later fold the dev data into this. + tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (a subset of the training data is used as ${dir}/data/text/ted.txt to work + # out interpolation weights. 
+ # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt + + # get wordlist + awk '{print $1}' db/TEDLIUM_release-3/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist +fi + +order=4 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 ted=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=ted ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #[perplexity = 157.87] over 18290.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true: + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. 
+ + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/tedlium/s5_r3/path.sh b/egs/tedlium/s5_r3/path.sh new file mode 100755 index 00000000000..16d5314b9c2 --- /dev/null +++ b/egs/tedlium/s5_r3/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh new file mode 100755 index 00000000000..98bcab94ec5 --- /dev/null +++ b/egs/tedlium/s5_r3/results.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +filter_regexp=. 
+[ $# -ge 1 ] && filter_regexp=$1 + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +exit 0 + diff --git a/egs/tedlium/s5_r3/rnnlm b/egs/tedlium/s5_r3/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/tedlium/s5_r3/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh new file mode 100755 index 00000000000..d4f3a38fd49 --- /dev/null +++ b/egs/tedlium/s5_r3/run.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# +# Based mostly on the Switchboard recipe. The training database is TED-LIUM, +# it consists of TED talks with cleaned automatic transcripts: +# +# https://lium.univ-lemans.fr/ted-lium3/ +# http://www.openslr.org/resources (Mirror). +# +# The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, +# which allow free non-commercial use, while only a citation is required. +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Vincent Nguyen +# 2016 Johns Hopkins University (Author: Daniel Povey) +# 2018 François Hernandez +# +# Apache 2.0 +# + +. ./cmd.sh +. ./path.sh + + +set -e -o pipefail -u + +nj=35 +decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set + # after applying --seconds-per-spk-max 180. We decode with 4 threads, so + # this will be too many jobs if you're using run.pl. +stage=0 +train_rnnlm=false +train_lm=false + +. 
utils/parse_options.sh # accept options + +# Data preparation +if [ $stage -le 0 ]; then + local/download_data.sh +fi + +if [ $stage -le 1 ]; then + local/prepare_data.sh + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is + # more than our normal 30 jobs.] + for dset in dev test train; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset} + done +fi + + +if [ $stage -le 2 ]; then + local/prepare_dict.sh +fi + +if [ $stage -le 3 ]; then + utils/prepare_lang.sh data/local/dict_nosp \ + "<unk>" data/local/lang_nosp data/lang_nosp +fi + +if [ $stage -le 4 ]; then + # later on we'll change this script so you have the option to + # download the pre-built LMs from openslr.org instead of building them + # locally. + if $train_lm; then + local/ted_train_lm.sh + else + local/ted_download_lm.sh + fi +fi + +if [ $stage -le 5 ]; then + local/format_lms.sh +fi + +# Feature extraction +if [ $stage -le 6 ]; then + for set in test dev train; do + dir=data/$set + steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir + steps/compute_cmvn_stats.sh $dir + done +fi + +# Now we have 452 hours of training data. 
+# Well create a subset with 10k short segments to make flat-start training easier: +if [ $stage -le 7 ]; then + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort + utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup +fi + +# Train +if [ $stage -le 8 ]; then + steps/train_mono.sh --nj 20 --cmd "$train_cmd" \ + data/train_10kshort_nodup data/lang_nosp exp/mono +fi + +if [ $stage -le 9 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/mono exp/mono_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1 +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp + + # The slowest part about this decoding is the scoring, which we can't really + # control as the bottleneck is the NIST tools. + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 13 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train 
data/lang_nosp exp/tri2 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + exp/tri2/sil_counts_nowb.txt \ + exp/tri2/pron_bigram_counts_nowb.txt data/local/dict +fi + +if [ $stage -le 14 ]; then + utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang + cp -rT data/lang data/lang_rescore + cp data/lang_nosp/G.fst data/lang/ + cp data/lang_nosp_rescore/G.carpa data/lang_rescore/ + + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore + done +fi + +if [ $stage -le 15 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2_ali exp/tri3 + + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore + done +fi + +if [ $stage -le 16 ]; then + # this does some data-cleaning. It actually degrades the GMM-level results + # slightly, but the cleaned data should be useful when we add the neural net and chain + # systems. If not we'll remove this stage. + local/run_cleanup_segmentation.sh +fi + +if [ $stage -le 17 ]; then + # This will only work if you have GPUs on your system (and note that it requires + # you to have the queue set up the right way... 
see kaldi-asr.org/doc/queue.html) + local/chain/run_tdnnf.sh +fi + +if [ $stage -le 18 ]; then + # You can either train your own rnnlm or download a pre-trained one + if $train_rnnlm; then + local/rnnlm/tuning/run_lstm_tdnn_a.sh + local/rnnlm/average_rnnlm.sh + else + local/ted_download_rnnlm.sh + fi +fi + +if [ $stage -le 19 ]; then + # Here we rescore the lattices generated at stage 17 + rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged + lang_dir=data/lang_chain + ngram_order=4 + + for dset in dev test; do + data_dir=data/${dset}_hires + decoding_dir=exp/chain_cleaned/tdnnf_1a + suffix=$(basename $rnnlm_dir) + output_dir=${decoding_dir}_$suffix + + rnnlm/lmrescore_pruned.sh \ + --cmd "$decode_cmd --mem 4G" \ + --weight 0.5 --max-ngram-order $ngram_order \ + $lang_dir $rnnlm_dir \ + $data_dir $decoding_dir \ + $output_dir + done +fi + +echo "$0: success." +exit 0 diff --git a/egs/tedlium/s5_r3/steps b/egs/tedlium/s5_r3/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/tedlium/s5_r3/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/tedlium/s5_r3/utils b/egs/tedlium/s5_r3/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/tedlium/s5_r3/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file