From 7f3d44e8e5c872733d700d82339019dd1615398e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:16:15 +0200 Subject: [PATCH 01/35] initial commit run.sh s5_r3 WIP --- egs/tedlium/s5_r3/run.sh | 222 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100755 egs/tedlium/s5_r3/run.sh diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh new file mode 100755 index 00000000000..7147476fe52 --- /dev/null +++ b/egs/tedlium/s5_r3/run.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# +# Based mostly on the Switchboard recipe. The training database is TED-LIUM, +# it consists of TED talks with cleaned automatic transcripts: +# +# http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus +# http://www.openslr.org/resources (Mirror). +# +# The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, +# which allow free non-commercial use, while only a citation is required. +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Vincent Nguyen +# 2016 Johns Hopkins University (Author: Daniel Povey) +# 2018 François Hernandez +# +# Apache 2.0 +# + +. ./cmd.sh +. ./path.sh + + +set -e -o pipefail -u + +nj=35 +decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set + # after applying --seconds-per-spk-max 180. We decode with 4 threads, so + # this will be too many jobs if you're using run.pl. +stage=0 +train_rnnlm=true + +. utils/parse_options.sh # accept options + +# Data preparation +if [ $stage -le 0 ]; then + local/download_data.sh +fi + +if [ $stage -le 1 ]; then + local/prepare_data.sh + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is + # more than our normal 30 jobs.] 
+ for dset in dev test train; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset} + done +fi + + +if [ $stage -le 2 ]; then + local/prepare_dict.sh +fi + +if [ $stage -le 3 ]; then + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp +fi + +if [ $stage -le 4 ]; then + # later on we'll change this script so you have the option to + # download the pre-built LMs from openslr.org instead of building them + # locally. + local/ted_train_lm.sh +fi + +if [ $stage -le 5 ]; then + local/format_lms.sh +fi + +# Feature extraction +if [ $stage -le 6 ]; then + for set in test dev train; do + dir=data/$set + steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir + steps/compute_cmvn_stats.sh $dir + done +fi + +# Now we have 452 hours of training data. +# Well create a subset with 10k short segments to make flat-start training easier: +if [ $stage -le 7 ]; then + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort + utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup +fi + +# Train +if [ $stage -le 8 ]; then + steps/train_mono.sh --nj 20 --cmd "$train_cmd" \ + data/train_10kshort_nodup data/lang_nosp exp/mono +fi + +if [ $stage -le 9 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/mono exp/mono_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1 +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp + + # The slowest part about this decoding is the scoring, which we can't really + # control as the bottleneck is the NIST tools. 
+ for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 13 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + exp/tri2/sil_counts_nowb.txt \ + exp/tri2/pron_bigram_counts_nowb.txt data/local/dict +fi + +if [ $stage -le 14 ]; then + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + cp -rT data/lang data/lang_rescore + cp data/lang_nosp/G.fst data/lang/ + cp data/lang_nosp_rescore/G.carpa data/lang_rescore/ + + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore + done +fi + +if [ $stage -le 15 ]; then + steps/align_si.sh --nj 
$nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2_ali exp/tri3 + + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore + done +fi + + +if [ $stage -le 16 ]; then + # this does some data-cleaning. It actually degrades the GMM-level results + # slightly, but the cleaned data should be useful when we add the neural net and chain + # systems. If not we'll remove this stage. + local/run_cleanup_segmentation.sh +fi + + +if [ $stage -le 17 ]; then + # This will only work if you have GPUs on your system (and note that it requires + # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) + local/chain/tuning/run_tdnn_PR2114.sh +fi + + +if [ $stage -le 18 ]; then + # todo add option to choose between training and downloading + if $train_rnnlm; then + local/rnnlm/tuning/run_lstm_tdnn_a.sh + local/rnnlm/average_rnnlm.sh + fi +fi + + +if [ $stage -le 19 ]; then + # Here we rescore the lattices generated at stage 17 + rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged + lang_dir=data/lang_chain + ngram_order=4 + + for set in dev test; do + data_dir=data/${set}_hires + decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs + suffix=$(basename $rnnlm_dir) + output_dir=${decoding_dir}_$suffix + + rnnlm/lmrescore_pruned.sh \ + --cmd "$decode_cmd --mem 4G" \ + --weight 0.5 --max-ngram-order $ngram_order \ + $lang_dir $rnnlm_dir \ + $data_dir $decoding_dir \ + $output_dir + done +fi + + +echo "$0: success." 
+exit 0 From cd00b07120d05aad3f1e27054a1013baa1670f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:19:57 +0200 Subject: [PATCH 02/35] add links and conf dir s5_r3 --- egs/tedlium/s5_r3/conf/decode.config | 1 + egs/tedlium/s5_r3/conf/decode_dnn.config | 2 ++ egs/tedlium/s5_r3/conf/fbank.conf | 5 +++++ egs/tedlium/s5_r3/conf/mfcc.conf | 2 ++ egs/tedlium/s5_r3/conf/mfcc_hires.conf | 10 ++++++++++ egs/tedlium/s5_r3/conf/no_k20.conf | 13 +++++++++++++ egs/tedlium/s5_r3/conf/online_cmvn.conf | 1 + egs/tedlium/s5_r3/conf/pitch.conf | 2 ++ egs/tedlium/s5_r3/rnnlm | 1 + egs/tedlium/s5_r3/steps | 1 + egs/tedlium/s5_r3/utils | 1 + 11 files changed, 39 insertions(+) create mode 100644 egs/tedlium/s5_r3/conf/decode.config create mode 100644 egs/tedlium/s5_r3/conf/decode_dnn.config create mode 100644 egs/tedlium/s5_r3/conf/fbank.conf create mode 100644 egs/tedlium/s5_r3/conf/mfcc.conf create mode 100644 egs/tedlium/s5_r3/conf/mfcc_hires.conf create mode 100644 egs/tedlium/s5_r3/conf/no_k20.conf create mode 100644 egs/tedlium/s5_r3/conf/online_cmvn.conf create mode 100644 egs/tedlium/s5_r3/conf/pitch.conf create mode 120000 egs/tedlium/s5_r3/rnnlm create mode 120000 egs/tedlium/s5_r3/steps create mode 120000 egs/tedlium/s5_r3/utils diff --git a/egs/tedlium/s5_r3/conf/decode.config b/egs/tedlium/s5_r3/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/tedlium/s5_r3/conf/decode_dnn.config b/egs/tedlium/s5_r3/conf/decode_dnn.config new file mode 100644 index 00000000000..ab8dcc1dc08 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=13.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=8.0 # this has most effect on size of the lattices. 
diff --git a/egs/tedlium/s5_r3/conf/fbank.conf b/egs/tedlium/s5_r3/conf/fbank.conf new file mode 100644 index 00000000000..4c57f8a8765 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/fbank.conf @@ -0,0 +1,5 @@ +--window-type=hamming # disable Dans window, use the standard +--use-energy=false # only fbank outputs +--dither=1 +--num-mel-bins=40 # 8 filters/octave, 40 filters/16Khz as used by IBM +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/tedlium/s5_r3/conf/mfcc.conf b/egs/tedlium/s5_r3/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/tedlium/s5_r3/conf/mfcc_hires.conf b/egs/tedlium/s5_r3/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/tedlium/s5_r3/conf/no_k20.conf b/egs/tedlium/s5_r3/conf/no_k20.conf new file mode 100644 index 00000000000..f0cba4df971 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/no_k20.conf @@ -0,0 +1,13 @@ +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q +option gpu=* -l gpu=$0 -q g.q +default allow_k20=true +option allow_k20=true +option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' diff --git a/egs/tedlium/s5_r3/conf/online_cmvn.conf b/egs/tedlium/s5_r3/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/tedlium/s5_r3/conf/pitch.conf b/egs/tedlium/s5_r3/conf/pitch.conf new file mode 100644 index 00000000000..bba51335be3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/pitch.conf @@ -0,0 +1,2 @@ +--nccf-ballast-online=true # helps for online operation. 
+ diff --git a/egs/tedlium/s5_r3/rnnlm b/egs/tedlium/s5_r3/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/tedlium/s5_r3/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/tedlium/s5_r3/steps b/egs/tedlium/s5_r3/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/tedlium/s5_r3/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/tedlium/s5_r3/utils b/egs/tedlium/s5_r3/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/tedlium/s5_r3/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 769809cfcaf7173463a648c073db52a624a6859c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:36:42 +0200 Subject: [PATCH 03/35] add tdnnf best result script TODO header --- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh new file mode 100755 index 00000000000..9cf4e00a0b3 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -0,0 +1,252 @@ +#!/bin/bash + +# TODO clean this header !!! +# run_tdnn_1f.sh is like run_tdnn_1e.sh but it use 2 to 6 jobs and add proportional-shrink 20. 
+ +#exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) +#exp/chain_cleaned/tdnn1f_sp_bi/: num-iters=444 nj=2..6 num-params=7.0M dim=40+100->3603 combine=-0.114->-0.113 xent:train/valid[295,443,final]=(-1.59,-1.51,-1.49/-1.58,-1.52,-1.50) logprob:train/valid[295,443,final]=(-0.112,-0.102,-0.098/-0.122,-0.113,-0.110) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1d_sp_bi exp/chain_cleaned/tdnn1e_sp_bi +# System tdnn1e_sp_bi tdnn1f_sp_bi +# WER on dev(orig) 9.2 9.0 +# WER on dev(rescored) 8.6 8.2 +# WER on test(orig) 9.4 9.1 +# WER on test(rescored) 8.9 8.7 +# Final train prob -0.0776 -0.0983 +# Final valid prob -0.0992 -0.1103 +# Final train prob (xent) -1.3110 -1.4893 +# Final valid prob (xent) -1.4353 -1.4951 + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + 
linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 0bd925430b9a7ba126bf4d1b24708f58950ad4c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:38:57 +0200 Subject: [PATCH 04/35] add some rnnlm scripts WIP --- .../s5_r3/local/rnnlm/average_rnnlm.sh | 57 ++++++++++ .../s5_r3/local/rnnlm/prepare_rnnlm_data.sh | 61 +++++++++++ .../local/rnnlm/tuning/run_lstm_tdnn_a.sh | 101 ++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh create mode 100755 egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh create mode 100755 egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh new file mode 100755 index 00000000000..9ae9307d93d --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez (Ubiqus) +# +# This script takes a rnnlm_dir and averages its models. +# +# Takes the default rnnlm_dir of tedlium s5_r3 recipe, +# and average the best model and the 10 previous and +# following ones (if they exist). + + +. ./cmd.sh +. 
./path.sh + +set -e -o pipefail -u + +rnnlm_dir=exp/rnnlm_lstm_tdnn_a +begin= +end= + +. utils/parse_options.sh # accept options + +# get the best iteration +best_iter=$(rnnlm/get_best_model.py $dir) + +# get num_iters +info=$(grep "num_iters" $rnnlm_dir/info.txt) +num_iters=${info##*=} + + +# test if begin and end exist +if [ -z $begin ] && [ -z $end ]; then + begin=$(($best_iter-10)) + end=$(($best_iter+10)) + if [ $begin -le 1 ]; then + begin=1 + fi + if [ ! $end -le $num_iters ]; then + end=$num_iters + fi +fi + +# create list of models and embeddings files to merge +models="" +embeddings="" +for num in $(seq -s' ' $begin $end); do + models=$models" $rnnlm_dir/$num.raw" + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" +done + +# merge list of files +nnet3-average $models ${rnnlm_dir}_averaged/final.raw +matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat + +# copy other files to averaged rnnlm_dir +cp -r $rnnlm_dir/{info.txt,word_feats.txt,config,special_symbol_opts.txt} ${rnnlm_dir}_averaged + diff --git a/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh new file mode 100755 index 00000000000..ba6252450da --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# To be run from the egs/ directory. + +. path.sh + +set -e -o pipefail -u + +# it should contain things like +# foo.txt, bar.txt, and dev.txt (dev.txt is a special filename that's +# obligatory). +data_dir=data/rnnlm +dir=exp/rnnlm/ +mkdir -p $dir + +# validata data dir +rnnlm/validate_data_dir.py $data_dir/data/ + +# get unigram counts +rnnlm/get_unigram_counts.sh $data_dir/data/ + +# get vocab +mkdir -p $data_dir/vocab +rnnlm/get_vocab.py $data_dir/data > $data_dir/vocab/words.txt + +# Choose weighting and multiplicity of data. 
+# The following choices would mean that data-source 'foo' +# is repeated once per epoch and has a weight of 0.5 in the +# objective function when training, and data-source 'bar' is repeated twice +# per epoch and has a data -weight of 1.5. +# There is no contraint that the average of the data weights equal one. +# Note: if a data-source has zero multiplicity, it just means you are ignoring +# it; but you must include all data-sources. +#cat > exp/foo/data_weights.txt < $dir/data_weights.txt < $dir/unigram_probs.txt + +# choose features +rnnlm/choose_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt > $dir/features.txt +# validate features +rnnlm/validate_features.py $dir/features.txt + +# make features for word +rnnlm/make_word_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt $dir/features.txt \ + > $dir/word_feats.txt + +# validate word features +rnnlm/validate_word_features.py --features-file $dir/features.txt \ + $dir/word_feats.txt diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh new file mode 100755 index 00000000000..9519ab3e87e --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson +# 2017 Hainan Xu +# 2017 Ke Li +# 2018 François Hernandez (Ubiqus) +# +# rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. + +# Begin configuration section. +dir=exp/rnnlm_lstm_tdnn_a +embedding_dim=800 +lstm_rpd=200 +lstm_nrpd=200 +stage=-10 +train_stage=-10 +epochs=20 + +. ./cmd.sh +. 
utils/parse_options.sh +[ -z "$cmd" ] && cmd=$train_cmd + +text_from_audio=data/train/text +text=data/rnnlm/train.txt.shuffled +wordlist=data/lang_chain/words.txt +dev_sents=10000 +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $text $wordlist; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/prepare_data.sh and utils/prepare_lang.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + # shuffle text from audio and lm + cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + shuf > data/rnnlm/full_lm_data.shuffled + # create dev and train sets based on audio and LM data + cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt + cat data/rnnlm/full_lm_data.shuffled | tail -n +$[$dev_sents+1] > $text_dir/ted.txt + +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features=10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < Date: Fri, 4 May 2018 17:03:45 +0200 Subject: [PATCH 05/35] add {cmd,path,results}.sh --- egs/tedlium/s5_r3/cmd.sh | 27 +++++++++++++++++++++++++++ egs/tedlium/s5_r3/path.sh | 6 ++++++ egs/tedlium/s5_r3/results.sh | 10 ++++++++++ 3 files changed, 43 insertions(+) create mode 100755 egs/tedlium/s5_r3/cmd.sh create mode 100755 egs/tedlium/s5_r3/path.sh create mode 100755 egs/tedlium/s5_r3/results.sh diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/tedlium/s5_r3/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). 
+export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/tedlium/s5_r3/path.sh b/egs/tedlium/s5_r3/path.sh new file mode 100755 index 00000000000..16d5314b9c2 --- /dev/null +++ b/egs/tedlium/s5_r3/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh new file mode 100755 index 00000000000..98bcab94ec5 --- /dev/null +++ b/egs/tedlium/s5_r3/results.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +filter_regexp=. 
+[ $# -ge 1 ] && filter_regexp=$1 + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +exit 0 + From 45b300b8e4f184e62a8b3f65272409e9f4ff58e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 17:10:59 +0200 Subject: [PATCH 06/35] add some unchanged scripts from r2 to r3 --- egs/tedlium/s5_r3/local/format_lms.sh | 39 +++ egs/tedlium/s5_r3/local/join_suffix.py | 26 ++ .../s5_r3/local/nnet3/run_ivector_common.sh | 238 ++++++++++++++++++ egs/tedlium/s5_r3/local/score.sh | 1 + 4 files changed, 304 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/format_lms.sh create mode 100755 egs/tedlium/s5_r3/local/join_suffix.py create mode 100755 egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh create mode 120000 egs/tedlium/s5_r3/local/score.sh diff --git a/egs/tedlium/s5_r3/local/format_lms.sh b/egs/tedlium/s5_r3/local/format_lms.sh new file mode 100755 index 00000000000..bba5bbd17ec --- /dev/null +++ b/egs/tedlium/s5_r3/local/format_lms.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + + +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +if [ -f data/lang_nosp/G.fst ] && [ data/lang_nosp/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp/G.fst as it already exists and " + echo ".. is newer than the source LM." 
+else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp +fi + + + +if [ -f data/lang_nosp_rescore/G.carpa ] && [ data/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already by up to date." +else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp data/lang_nosp_rescore || exit 1; +fi + +exit 0; diff --git a/egs/tedlium/s5_r3/local/join_suffix.py b/egs/tedlium/s5_r3/local/join_suffix.py new file mode 100755 index 00000000000..64c62964331 --- /dev/null +++ b/egs/tedlium/s5_r3/local/join_suffix.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# +# Copyright 2014 Nickolay V. Shmyrev +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + + +import sys +from codecs import open + +# This script joins together pairs of split-up words like "you 're" -> "you're". +# The TEDLIUM transcripts are normalized in a way that's not traditional for +# speech recognition. 
+ +for line in sys.stdin: + items = line.split() + new_items = [] + i = 1 + while i < len(items): + if i < len(items) - 1 and items[i+1][0] == '\'': + new_items.append(items[i] + items[i+1]) + i = i + 1 + else: + new_items.append(items[i]) + i = i + 1 + print(items[0] + ' ' + ' '.join(new_items)) diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..337092b1520 --- /dev/null +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +nj=30 +min_seg_len=1.55 # min length in seconds... we do this because chain training + # will discard segments shorter than 1.5 seconds. Must remain in sync + # with the same option given to prepare_lores_feats_and_alignments.sh +train_set=train_cleaned # you might set this to e.g. train. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp_comb + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." 
+ exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" + # we have to combine short segments or we won't be able to train chain models + # on those segments. + utils/data/combine_short_segments.sh \ + data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb + + # just copy over the CMVN to avoid having to recompute it. 
+ cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ + utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ +fi + +if [ $stage -le 4 ]; then + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l Date: Tue, 22 May 2018 10:43:00 +0200 Subject: [PATCH 07/35] add download script --- egs/tedlium/s5_r3/local/download_data.sh | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/download_data.sh diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh new file mode 100755 index 00000000000..49de5b12372 --- /dev/null +++ b/egs/tedlium/s5_r3/local/download_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 John Hopkins University (author: Daniel Povey) +# Apache 2.0 + +mkdir -p db + +cd db ### Note: the rest of this script is executed from the directory 'db'. + +# TED-LIUM database: +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + if [ ! -e TEDLIUM_release-3 ]; then + ln -sf /export/corpora5/TEDLIUM_release-3 + fi + echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" +else + if [ ! -e TEDLIUM_release-3 ]; then + echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" + # the following command won't re-get it if it's already there + # because of the --continue switch. 
+      wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1
+      tar xf "TEDLIUM_release-3.tgz"
+    else
+      echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists."
+    fi
+fi
+
+
+num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l)
+if [ "$num_sph" != 2351 ]; then
+  echo "$0: expected to find 2351 .sph files in the directory db/TEDLIUM_release-3, found $num_sph"
+  exit 1
+fi
+
+exit 0
+

From 58f7343a163c0b29fd86f85f1a6c8379844f8295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Tue, 22 May 2018 10:58:07 +0200
Subject: [PATCH 08/35] local/prepare_data.sh

---
 egs/tedlium/s5_r3/local/prepare_data.sh | 76 +++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100755 egs/tedlium/s5_r3/local/prepare_data.sh

diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh
new file mode 100755
index 00000000000..ea6241f7c29
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/prepare_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+#
+# Copyright 2014 Nickolay V. Shmyrev
+#           2014 Brno University of Technology (Author: Karel Vesely)
+#           2016 Johns Hopkins University (Author: Daniel Povey)
+#           2018 François Hernandez
+#
+# Apache 2.0
+
+# To be run from one directory above this script.
+
+. ./path.sh
+
+export LC_ALL=C
+
+# Prepare: test, train,
+for set in dev test train; do
+  dir=data/$set.orig
+  mkdir -p $dir
+
+  # Merge transcripts into a single 'stm' file, do some mappings:
+  # - <F0_M> -> <o,f0,male> : map dev stm labels to be coherent with train + test,
+  # - <F0_F> -> <o,f0,female> : --||--
+  # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary
+  # - <sil> -> null : remove marked <sil>, it is modelled implicitly (in kaldi)
+  # - (...) 
-> null : remove utterance names from end-lines of train + # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py) + { # Add STM header, so sclite can prepare the '.lur' file + echo ';; +;; LABEL "o" "Overall" "Overall results" +;; LABEL "f0" "f0" "Wideband channel" +;; LABEL "f2" "f2" "Telephone channel" +;; LABEL "male" "Male" "Male Talkers" +;; LABEL "female" "Female" "Female Talkers" +;;' + # Process the STMs + cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ + sed -e 's:([^ ]*)$::' | \ + awk '{ $2 = "A"; print $0; }' + } > data/$set.orig/stm + + # Prepare 'text' file + # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary + cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \ + awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100); + for (i=7;i<=NF;i++) { printf(" %s", $i); } + printf("\n"); + }' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1 + + # Prepare 'segments', 'utt2spk', 'spk2utt' + cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments + cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk + cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt + + # Prepare 'wav.scp', 'reco2file_and_channel' + cat $dir/spk2utt | awk -v set=$set -v pwd=$PWD '{ printf("%s sph2pipe -f wav -p %s/db/TEDLIUM_release-3/legacy/%s/sph/%s.sph |\n", $1, pwd, set, $1); }' > $dir/wav.scp + cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel + + # Create empty 'glm' file + echo ';; empty.glm + [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token + ' > data/$set.orig/glm + + # The training set seems to not have enough silence padding in the segmentations, + # especially at the beginning of segments. Extend the times. 
+  if [ $set == "train" ]; then
+    mv data/$set.orig/segments data/$set.orig/segments.temp
+    utils/data/extend_segment_times.py --start-padding=0.15 \
+      --end-padding=0.1 <data/$set.orig/segments.temp >data/$set.orig/segments || exit 1
+    rm data/$set.orig/segments.temp
+  fi
+
+  # Check that data dirs are okay!
+  utils/validate_data_dir.sh --no-feats $dir || exit 1
+done
+

From abae0fb849b1253fddcec354b628cd6e4c2d7156 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3%A7ois=20Hernandez?=
Date: Tue, 22 May 2018 10:58:07 +0200
Subject: [PATCH 09/35] local/prepare_dict.sh

---
 egs/tedlium/s5_r3/local/prepare_dict.sh | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100755 egs/tedlium/s5_r3/local/prepare_dict.sh

diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh
new file mode 100755
index 00000000000..3cdbcb3fdf6
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/prepare_dict.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Copyright 2014 Nickolay V. Shmyrev
+#           2014 Brno University of Technology (Author: Karel Vesely)
+#           2016 Daniel Galvez
+#           2016 Vincent Nguyen
+# Apache 2.0
+#
+
+dir=data/local/dict_nosp
+mkdir -p $dir
+
+srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic
+
+[ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1
+
+# Join dicts and fix some troubles
+cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \
+  LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt
+
+cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
+  grep -v SIL | sort > $dir/nonsilence_phones.txt
+
+( echo SIL; echo NSN ) > $dir/silence_phones.txt
+
+echo SIL > $dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone.
+echo -n >$dir/extra_questions.txt
+
+# Add to the lexicon the silences, noises etc.
+# Typically, you would use "<UNK> NSN" here, but the Cantab Research language models
+# use <unk> instead of <UNK> to represent out of vocabulary words. 
+echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt
+
+# Check that the dict dir is okay!
+utils/validate_dict_dir.pl $dir || exit 1

From 6a2ae2949e4ee33c430fc16c17dd5b81fb505fa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3%A7ois=20Hernandez?=
Date: Tue, 22 May 2018 11:36:49 +0200
Subject: [PATCH 10/35] add option to download lms

---
 egs/tedlium/s5_r3/local/ted_download_lm.sh |  16 +++
 egs/tedlium/s5_r3/local/ted_train_lm.sh    | 139 +++++++++++++++++
 egs/tedlium/s5_r3/run.sh                   |  15 ++-
 3 files changed, 166 insertions(+), 4 deletions(-)
 create mode 100644 egs/tedlium/s5_r3/local/ted_download_lm.sh
 create mode 100755 egs/tedlium/s5_r3/local/ted_train_lm.sh

diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh
new file mode 100644
index 00000000000..ad833555b5f
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/ted_download_lm.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Copyright 2018 David Snyder
+# Apache 2.0
+#
+# This script downloads pre-built language models trained on the Cantab-Tedlium
+# text data and Tedlium acoustic training data. If you want to build these
+# models yourself, run the script local/ted_train_lm.sh. 
+ +set -e + +echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 +wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 + +exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh new file mode 100755 index 00000000000..20ea2ca3216 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# This script trains a LM on the Cantab-Tedlium text data and tedlium acoustic training data. +# It is based on the example scripts distributed with PocoLM + +# It will first check if pocolm is installed and if not will process with installation +# It will then get the source data from the pre-downloaded Cantab-Tedlium files +# and the pre-prepared data/train text source. + + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. 
+# These example numbers of metaparameters are for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406"
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure the make_lm_dir.py be called and train only 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result.
+  gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz
+  # use a subset of the annotated training data as the dev set .
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+  # .. and the rest of the training data as an additional data source.
+  # we can later fold the dev data into this.
+  tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/ted.txt to work
+  # out interpolation weights.
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt
+
+  # get wordlist
+  awk '{print $1}' db/TEDLIUM_release-3/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist
+fi
+
+order=4
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary. 
+ # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 ted=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=ted ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #[perplexity = 157.87] over 18290.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true: + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
+ size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 7147476fe52..ad568891ddb 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -3,7 +3,7 @@ # Based mostly on the Switchboard recipe. The training database is TED-LIUM, # it consists of TED talks with cleaned automatic transcripts: # -# http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus +# https://lium.univ-lemans.fr/ted-lium3/ # http://www.openslr.org/resources (Mirror). # # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, @@ -28,8 +28,9 @@ nj=35 decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. -stage=0 +stage=5 train_rnnlm=true +train_lms=false . utils/parse_options.sh # accept options @@ -63,13 +64,19 @@ if [ $stage -le 4 ]; then # later on we'll change this script so you have the option to # download the pre-built LMs from openslr.org instead of building them # locally. 
- local/ted_train_lm.sh + if $train_lms; then + local/ted_train_lm.sh + else + local/ted_download_lm.sh + fi fi if [ $stage -le 5 ]; then local/format_lms.sh fi +exit + # Feature extraction if [ $stage -le 6 ]; then for set in test dev train; do @@ -202,7 +209,7 @@ if [ $stage -le 19 ]; then lang_dir=data/lang_chain ngram_order=4 - for set in dev test; do + for dset in dev test; do data_dir=data/${set}_hires decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs suffix=$(basename $rnnlm_dir) From 1ac8696a8ccd76872a28b58b3f2ccf636cd21e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 11:40:20 +0200 Subject: [PATCH 11/35] remove local/join_suffix.py --- egs/tedlium/s5_r3/local/join_suffix.py | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100755 egs/tedlium/s5_r3/local/join_suffix.py diff --git a/egs/tedlium/s5_r3/local/join_suffix.py b/egs/tedlium/s5_r3/local/join_suffix.py deleted file mode 100755 index 64c62964331..00000000000 --- a/egs/tedlium/s5_r3/local/join_suffix.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2014 Nickolay V. Shmyrev -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - - -import sys -from codecs import open - -# This script joins together pairs of split-up words like "you 're" -> "you're". -# The TEDLIUM transcripts are normalized in a way that's not traditional for -# speech recognition. 
- -for line in sys.stdin: - items = line.split() - new_items = [] - i = 1 - while i < len(items): - if i < len(items) - 1 and items[i+1][0] == '\'': - new_items.append(items[i] + items[i+1]) - i = i + 1 - else: - new_items.append(items[i]) - i = i + 1 - print(items[0] + ' ' + ' '.join(new_items)) From ceb03deace54a21c0d3b51ac5ecee0bedcc009e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 11:43:35 +0200 Subject: [PATCH 12/35] local/run_cleanup_segmentation.sh stage 16 --- .../s5_r3/local/run_cleanup_segmentation.sh | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh diff --git a/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh b/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..559d20046dd --- /dev/null +++ b/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri3 +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. 
utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore + done +fi From 52f70e12ea970b7e52ab6ed812bf4c2a97117ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 12:10:50 +0200 Subject: [PATCH 13/35] add run_tdnnf.sh link --- egs/tedlium/s5_r3/local/chain/run_tdnnf.sh | 1 + egs/tedlium/s5_r3/run.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 egs/tedlium/s5_r3/local/chain/run_tdnnf.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..cbbf0ed6533 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnnf_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index ad568891ddb..d4c123abfaa 100755 --- 
a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -190,7 +190,7 @@ fi if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) - local/chain/tuning/run_tdnn_PR2114.sh + local/chain/run_tdnnf.sh fi From 252c70d3b38c1588945c974a504538ae64a9beeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 12:24:52 +0200 Subject: [PATCH 14/35] clean header chain scripts --- .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 235 ++++++++++++++++++ .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 34 ++- 2 files changed, 250 insertions(+), 19 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..499fa7f6d49 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# See run_tdnnf_1a.sh for comparative results. + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1f #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +export CUDA_VISIBLE_DEVICES=0,1,2 + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 self-repair-scale=1.0e-04 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=1024 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is 
mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index 9cf4e00a0b3..8570e54c626 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -1,21 +1,19 @@ #!/bin/bash -# TODO clean this header !!! -# run_tdnn_1f.sh is like run_tdnn_1e.sh but it use 2 to 6 jobs and add proportional-shrink 20. 
- -#exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) -#exp/chain_cleaned/tdnn1f_sp_bi/: num-iters=444 nj=2..6 num-params=7.0M dim=40+100->3603 combine=-0.114->-0.113 xent:train/valid[295,443,final]=(-1.59,-1.51,-1.49/-1.58,-1.52,-1.50) logprob:train/valid[295,443,final]=(-0.112,-0.102,-0.098/-0.122,-0.113,-0.110) - -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1d_sp_bi exp/chain_cleaned/tdnn1e_sp_bi -# System tdnn1e_sp_bi tdnn1f_sp_bi -# WER on dev(orig) 9.2 9.0 -# WER on dev(rescored) 8.6 8.2 -# WER on test(orig) 9.4 9.1 -# WER on test(rescored) 8.9 8.7 -# Final train prob -0.0776 -0.0983 -# Final valid prob -0.0992 -0.1103 -# Final train prob (xent) -1.3110 -1.4893 -# Final valid prob (xent) -1.4353 -1.4951 +# run_tdnnf_1a.sh is the script whose results are presented in the corpus release paper. +# It uses 2 to 6 jobs and adds proportional-shrink 10. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(rescored) 7.6 7.2 +# WER on test(orig) 8.1 8.0 +# WER on test(rescored) 7.7 7.5 +# Final train prob -0.0802 -0.0779 +# Final valid prob -0.0980 -0.0906 +# Final train prob (xent) -1.1450 -0.9021 +# Final valid prob (xent) -1.2498 -0.9971 + ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; ## otherwise call it directly in its location). @@ -28,14 +26,12 @@ # note, if you have already run the corresponding non-chain nnet3 system # (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. -# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism -# to get the configuration.
set -e -o pipefail # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). -stage=17 +stage=0 nj=30 decode_nj=30 min_seg_len=1.55 From 6f9bd8b1c8e5e257faeebab9c17b5023060b85a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:24:30 +0200 Subject: [PATCH 15/35] clean chain tuning naming --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 6 ++---- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 499fa7f6d49..7a393db663c 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -34,8 +34,8 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1f #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/egs # you can set this to use previously dumped egs. +tdnn_affix=_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -44,8 +44,6 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh -export CUDA_VISIBLE_DEVICES=0,1,2 - if ! 
cuda-compiled; then cat < Date: Tue, 22 May 2018 14:25:34 +0200 Subject: [PATCH 16/35] some lm related scripts --- egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh | 9 ++++++--- egs/tedlium/s5_r3/local/ted_download_lm.sh | 0 egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) mode change 100644 => 100755 egs/tedlium/s5_r3/local/ted_download_lm.sh create mode 100755 egs/tedlium/s5_r3/local/ted_download_rnnlm.sh diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh index 9ae9307d93d..61ad07645ff 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -21,7 +21,7 @@ end= . utils/parse_options.sh # accept options # get the best iteration -best_iter=$(rnnlm/get_best_model.py $dir) +best_iter=$(rnnlm/get_best_model.py $rnnlm_dir) # get num_iters info=$(grep "num_iters" $rnnlm_dir/info.txt) @@ -44,11 +44,14 @@ fi models="" embeddings="" for num in $(seq -s' ' $begin $end); do - models=$models" $rnnlm_dir/$num.raw" - embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" + [ -f $rnnlm_dir/$num.raw ] && \ + models=$models" $rnnlm_dir/$num.raw" + [ -f $rnnlm_dir/feat_embedding.$num.mat ] && \ + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" done # merge list of files +mkdir -p ${rnnlm_dir}_averaged nnet3-average $models ${rnnlm_dir}_averaged/final.raw matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh old mode 100644 new mode 100755 diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh new file mode 100755 index 00000000000..fb85be9e897 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez +# Apache 2.0 +# +# This 
script downloads pre-built RNN language models trained on the TED-LIUM +# text data and acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_rnnlm.sh. + +set -e + +echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/6/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 +rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz + +exit 0 \ No newline at end of file From 442d22c1ef63beade5d2951e622502649a815001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:29:10 +0200 Subject: [PATCH 17/35] minor change run.sh --- egs/tedlium/s5_r3/run.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index d4c123abfaa..7bb384fe314 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -29,8 +29,8 @@ decode_nj=30 # note: should not be >38 which is the number of speakers in the # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. stage=5 -train_rnnlm=true -train_lms=false +train_rnnlm=false +train_lm=false . utils/parse_options.sh # accept options @@ -64,7 +64,7 @@ if [ $stage -le 4 ]; then # later on we'll change this script so you have the option to # download the pre-built LMs from openslr.org instead of building them # locally. 
- if $train_lms; then + if $train_lm; then local/ted_train_lm.sh else local/ted_download_lm.sh @@ -195,14 +195,18 @@ fi if [ $stage -le 18 ]; then - # todo add option to choose between training and downloading + # You can either train your own rnnlm or download a pre-trained one if $train_rnnlm; then local/rnnlm/tuning/run_lstm_tdnn_a.sh local/rnnlm/average_rnnlm.sh + else + local/ted_download_rnnlm.sh fi fi + + if [ $stage -le 19 ]; then # Here we rescore the lattices generated at stage 17 rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged @@ -211,7 +215,7 @@ if [ $stage -le 19 ]; then for dset in dev test; do data_dir=data/${set}_hires - decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs + decoding_dir=exp/chain/tdnnf_1a suffix=$(basename $rnnlm_dir) output_dir=${decoding_dir}_$suffix From 50109119d048c1fc1c51b4be89d1459b3a59672c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:30:44 +0200 Subject: [PATCH 18/35] reset stage run --- egs/tedlium/s5_r3/run.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 7bb384fe314..9bf240dd9d3 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -28,7 +28,7 @@ nj=35 decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. 
-stage=5 +stage=0 train_rnnlm=false train_lm=false @@ -75,8 +75,6 @@ if [ $stage -le 5 ]; then local/format_lms.sh fi -exit - # Feature extraction if [ $stage -le 6 ]; then for set in test dev train; do From dbb4440812d6afa090827e128cfd1d95a0e4cfa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:32:29 +0200 Subject: [PATCH 19/35] cosmetic --- egs/tedlium/s5_r3/run.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 9bf240dd9d3..74904fbd1ac 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -176,7 +176,6 @@ if [ $stage -le 15 ]; then done fi - if [ $stage -le 16 ]; then # this does some data-cleaning. It actually degrades the GMM-level results # slightly, but the cleaned data should be useful when we add the neural net and chain @@ -184,14 +183,12 @@ if [ $stage -le 16 ]; then local/run_cleanup_segmentation.sh fi - if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) local/chain/run_tdnnf.sh fi - if [ $stage -le 18 ]; then # You can either train your own rnnlm or download a pre-trained one if $train_rnnlm; then @@ -202,9 +199,6 @@ if [ $stage -le 18 ]; then fi fi - - - if [ $stage -le 19 ]; then # Here we rescore the lattices generated at stage 17 rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged @@ -226,6 +220,5 @@ if [ $stage -le 19 ]; then done fi - echo "$0: success." 
exit 0 From 62b8826611ecc4a85554f3e03f494d87090591b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:37:42 +0200 Subject: [PATCH 20/35] add rnnlm results --- egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 9519ab3e87e..302c67d1243 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -8,6 +8,14 @@ # rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. # rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(ngram) 7.6 7.2 +# WER on dev(rnnlm) 6.3 6.1 +# WER on test(orig) 8.1 8.0 +# WER on test(ngram) 7.7 7.5 +# WER on test(rnnlm) 6.7 6.6 + # Begin configuration section. 
dir=exp/rnnlm_lstm_tdnn_a embedding_dim=800 From cc6284183c4df5533b682b5fa2262a5f7b110b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:42:58 +0200 Subject: [PATCH 21/35] LM corpus for rnnlm --- egs/tedlium/s5_r3/local/prepare_data.sh | 3 +++ egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh | 1 + egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 120000 egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index ea6241f7c29..c8a9e0a8665 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -13,6 +13,9 @@ export LC_ALL=C +# Prepare LM data +gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > data/LM/train.txt + # Prepare: test, train, for set in dev test train; do dir=data/$set.orig diff --git a/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh new file mode 120000 index 00000000000..72a3172db41 --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh @@ -0,0 +1 @@ +tuning/run_lstm_tdnn_a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 302c67d1243..32252db937d 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -30,7 +30,7 @@ epochs=20 [ -z "$cmd" ] && cmd=$train_cmd text_from_audio=data/train/text -text=data/rnnlm/train.txt.shuffled +text=data/LM/train.txt wordlist=data/lang_chain/words.txt dev_sents=10000 text_dir=data/rnnlm/text From c022a0a813295e9344c43611114735d9af8a439b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:49:16 +0200 Subject: [PATCH 22/35] remove useless config files --- 
egs/tedlium/s5_r3/conf/decode.config | 1 - egs/tedlium/s5_r3/conf/decode_dnn.config | 2 -- egs/tedlium/s5_r3/conf/fbank.conf | 5 ----- egs/tedlium/s5_r3/conf/no_k20.conf | 13 ------------- egs/tedlium/s5_r3/conf/pitch.conf | 2 -- 5 files changed, 23 deletions(-) delete mode 100644 egs/tedlium/s5_r3/conf/decode.config delete mode 100644 egs/tedlium/s5_r3/conf/decode_dnn.config delete mode 100644 egs/tedlium/s5_r3/conf/fbank.conf delete mode 100644 egs/tedlium/s5_r3/conf/no_k20.conf delete mode 100644 egs/tedlium/s5_r3/conf/pitch.conf diff --git a/egs/tedlium/s5_r3/conf/decode.config b/egs/tedlium/s5_r3/conf/decode.config deleted file mode 100644 index 7ba966f2b83..00000000000 --- a/egs/tedlium/s5_r3/conf/decode.config +++ /dev/null @@ -1 +0,0 @@ -# empty config, just use the defaults. diff --git a/egs/tedlium/s5_r3/conf/decode_dnn.config b/egs/tedlium/s5_r3/conf/decode_dnn.config deleted file mode 100644 index ab8dcc1dc08..00000000000 --- a/egs/tedlium/s5_r3/conf/decode_dnn.config +++ /dev/null @@ -1,2 +0,0 @@ -beam=13.0 # beam for decoding. Was 13.0 in the scripts. -lattice_beam=8.0 # this has most effect on size of the lattices. 
diff --git a/egs/tedlium/s5_r3/conf/fbank.conf b/egs/tedlium/s5_r3/conf/fbank.conf deleted file mode 100644 index 4c57f8a8765..00000000000 --- a/egs/tedlium/s5_r3/conf/fbank.conf +++ /dev/null @@ -1,5 +0,0 @@ ---window-type=hamming # disable Dans window, use the standard ---use-energy=false # only fbank outputs ---dither=1 ---num-mel-bins=40 # 8 filters/octave, 40 filters/16Khz as used by IBM ---htk-compat=true # try to make it compatible with HTK diff --git a/egs/tedlium/s5_r3/conf/no_k20.conf b/egs/tedlium/s5_r3/conf/no_k20.conf deleted file mode 100644 index f0cba4df971..00000000000 --- a/egs/tedlium/s5_r3/conf/no_k20.conf +++ /dev/null @@ -1,13 +0,0 @@ -# Default configuration -command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -option mem=* -l mem_free=$0,ram_free=$0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* -pe smp $0 -option num_threads=1 # Do not add anything to qsub_opts -option max_jobs_run=* -tc $0 -default gpu=0 -option gpu=0 -q all.q -option gpu=* -l gpu=$0 -q g.q -default allow_k20=true -option allow_k20=true -option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' diff --git a/egs/tedlium/s5_r3/conf/pitch.conf b/egs/tedlium/s5_r3/conf/pitch.conf deleted file mode 100644 index bba51335be3..00000000000 --- a/egs/tedlium/s5_r3/conf/pitch.conf +++ /dev/null @@ -1,2 +0,0 @@ ---nccf-ballast-online=true # helps for online operation. - From 774b2533b07a8c9c90a4af91325334dcaeb26009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 23 May 2018 09:39:16 +0200 Subject: [PATCH 23/35] remove host stuff from cmd.sh --- egs/tedlium/s5_r3/cmd.sh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh index 66ae9090820..23a2b7b6a51 100755 --- a/egs/tedlium/s5_r3/cmd.sh +++ b/egs/tedlium/s5_r3/cmd.sh @@ -12,16 +12,4 @@ # JHU cluster (or most clusters using GridEngine, with a suitable # conf/queue.conf). 
export train_cmd="queue.pl" -export decode_cmd="queue.pl --mem 4G" - -host=$(hostname -f) -if [ ${host#*.} == "fit.vutbr.cz" ]; then - queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, - export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" - export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" - export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" -elif [ ${host#*.} == "cm.cluster" ]; then - # MARCC bluecrab cluster: - export train_cmd="slurm.pl --time 4:00:00 " - export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " -fi +export decode_cmd="queue.pl --mem 4G" \ No newline at end of file From 3855c7b112c564b533380f5f3ec857daeebeb76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 23 May 2018 09:40:47 +0200 Subject: [PATCH 24/35] change rnnlm download link --- egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index fb85be9e897..609f0194541 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -10,7 +10,7 @@ set -e echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" -wget --continue http://kaldi-asr.org/models/6/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz From aed82134f9a81fd58bad6c1aa256944074331826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 09:43:47 +0200 Subject: [PATCH 25/35] change tdnnf affix --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index a5ce24454b1..1098e5598ef 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -45,7 +45,7 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=f_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. @@ -77,7 +77,7 @@ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp_comb tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix} +dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} train_data_dir=data/${train_set}_sp_hires_comb lores_train_data_dir=data/${train_set}_sp_comb train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb From 4a6a5072dd54eb6b7c7cb8d71850c3e458fdfb36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 10:28:04 +0200 Subject: [PATCH 26/35] change chunk width tdnn --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 7a393db663c..c7357310b84 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -185,7 +185,7 @@ if [ $stage -le 18 ]; then 
--chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width 150,110,100 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ From 09a849aaf9a718975f392eb17e22293ae3a8c667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 10:29:52 +0200 Subject: [PATCH 27/35] update ivector common strategy --- .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 10 +- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 10 +- .../s5_r3/local/nnet3/run_ivector_common.sh | 114 +++++------------- 3 files changed, 40 insertions(+), 94 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index c7357310b84..5e19fb5f0a0 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -63,13 +63,13 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi_1024_ps10 -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index 1098e5598ef..d4c2a0e0215 100755 --- 
a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -74,13 +74,13 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh index 337092b1520..5322da6240f 100755 --- a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -10,9 +10,7 @@ set -e -o pipefail stage=0 nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh + train_set=train_cleaned # you might set this to e.g. train. gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. @@ -27,7 +25,7 @@ nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! 
-f $f ]; then @@ -79,64 +77,23 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. 
- n1=$(wc -l Date: Thu, 24 May 2018 10:31:32 +0200 Subject: [PATCH 28/35] remove bi suffix and small fix --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 4 ++-- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 5e19fb5f0a0..e6613529ab6 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -64,9 +64,9 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} +tree_dir=exp/chain${nnet3_affix}/tree${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi_1024_ps10 +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${train_set}_sp_hires lores_train_data_dir=data/${train_set}_sp train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index d4c2a0e0215..cb468748b47 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -75,7 +75,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} +tree_dir=exp/chain${nnet3_affix}/tree${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} train_data_dir=data/${train_set}_sp_hires From 27067fc9cb69e24cc735b7bc1e91f870d87fceb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 6 Jun 2018 10:58:21 +0200 Subject: [PATCH 29/35] fix join_suffix stm data prep, add scoring scripts 
--- egs/tedlium/s5_r3/local/prepare_data.sh | 5 +- egs/tedlium/s5_r3/local/score.sh | 2 +- egs/tedlium/s5_r3/local/score_basic.sh | 55 ++++++++++++++ egs/tedlium/s5_r3/local/score_sclite.sh | 96 +++++++++++++++++++++++++ egs/tedlium/s5_r3/local/ted_train_lm.sh | 2 +- 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/score_basic.sh create mode 100755 egs/tedlium/s5_r3/local/score_sclite.sh diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index c8a9e0a8665..8de1752742b 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -13,9 +13,6 @@ export LC_ALL=C -# Prepare LM data -gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > data/LM/train.txt - # Prepare: test, train, for set in dev test train; do dir=data/$set.orig @@ -40,7 +37,7 @@ for set in dev test train; do cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ sed -e 's:([^ ]*)$::' | \ awk '{ $2 = "A"; print $0; }' - } > data/$set.orig/stm + } | local/join_suffix.py > data/$set.orig/stm # Prepare 'text' file # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary diff --git a/egs/tedlium/s5_r3/local/score.sh b/egs/tedlium/s5_r3/local/score.sh index d89286dc25a..f2835abb6d9 120000 --- a/egs/tedlium/s5_r3/local/score.sh +++ b/egs/tedlium/s5_r3/local/score.sh @@ -1 +1 @@ -score_sclite.sh \ No newline at end of file +score_basic.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/score_basic.sh b/egs/tedlium/s5_r3/local/score_basic.sh new file mode 100755 index 00000000000..47b57396c64 --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_basic.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +min_lmwt=7 +max_lmwt=17 +#end configuration section. + +[ -f ./path.sh ] && . 
./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; + +# Note: the double level of quoting for the sed command + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| \ + sed "'s:::g'" \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +# Show results +for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done + +exit 0; diff --git a/egs/tedlium/s5_r3/local/score_sclite.sh b/egs/tedlium/s5_r3/local/score_sclite.sh new file mode 100755 index 00000000000..16c8b30e52f --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_sclite.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012, +# Brno University of Technology (Author: Karel Vesely) 2014, +# Apache 2.0 +# + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +beam=7 # speed-up, but may affect MBR confidences. +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. 
parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; +hubdir=`dirname $hubscr` + +for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ + $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +# name=`basename $data`; # e.g. 
eval2000 +nj=$(cat $dir/num_jobs) + +mkdir -p $dir/scoring/log + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +if [ $stage -le 0 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + set -e -o pipefail \; \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-align-words --output-error-lats=true --max-expand=10.0 --test=false \ + $lang/phones/word_boundary.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ + sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; + done +fi + +if [ $stage -le 1 ]; then + # Remove some stuff we don't want to score, from the ctm. + for x in $dir/score_*/ctm; do + # `-i` is not needed in the following. It is added for robustness in ase this code is copy-pasted + # into another script that, e.g., uses instead of + grep -v -w -i '' <$x > ${x}.filt || exit 1; + done +fi + +# Score the set... 
+if [ $stage -le 2 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ + cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/ctm.filt || exit 1; + done +fi + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh index 20ea2ca3216..3c587f63094 100755 --- a/egs/tedlium/s5_r3/local/ted_train_lm.sh +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -58,7 +58,7 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result. + # Unzip TEDLIUM 6 data sources, remove , gzip the result. gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz # use a subset of the annotated training data as the dev set . # Note: the name 'dev' is treated specially by pocolm, it automatically From 677f7542e66fa4d7b496d9ca7ff2e3420f967d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:17:15 +0200 Subject: [PATCH 30/35] fix ted_download_rnnlm script --- egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index 609f0194541..431d44c6ff6 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -11,7 +11,12 @@ set -e echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 -tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 -rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz +cd 
exp/rnnlm_lstm_tdnn_a_averaged +tar -xvzf tedlium_rnnlm.tgz || exit 1 +rm tedlium_rnnlm.tgz +mkdir config +cd ../.. +cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt +echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt -exit 0 \ No newline at end of file +exit 0 From b1d93006a1d60551ace9d77503094c026e7d6cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:17:32 +0200 Subject: [PATCH 31/35] fix rnnlm rescoring in run.sh --- egs/tedlium/s5_r3/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 74904fbd1ac..d4f3a38fd49 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -206,8 +206,8 @@ if [ $stage -le 19 ]; then ngram_order=4 for dset in dev test; do - data_dir=data/${set}_hires - decoding_dir=exp/chain/tdnnf_1a + data_dir=data/${dset}_hires + decoding_dir=exp/chain_cleaned/tdnnf_1a suffix=$(basename $rnnlm_dir) output_dir=${decoding_dir}_$suffix From bf6807154e58f1f55fa27d43c9bcaee7bd73581b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:18:23 +0200 Subject: [PATCH 32/35] add both sclite and score_basic scores in tdnnf script --- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index cb468748b47..d807c636ace 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -4,15 +4,18 @@ # It use 2 to 6 jobs and add proportional-shrink 10. 
# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a -# System tdnn_1a tdnnf_1a -# WER on dev(orig) 8.2 7.9 -# WER on dev(rescored) 7.6 7.2 -# WER on test(orig) 8.1 8.0 -# WER on test(rescored) 7.7 7.5 -# Final train prob -0.0802 -0.0779 -# Final valid prob -0.0980 -0.0906 -# Final train prob (xent) -1.1450 -0.9021 -# Final valid prob (xent) -1.2498 -0.9971 +# System tdnn_1a tdnnf_1a tdnnf_1a +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; @@ -64,6 +67,7 @@ where "nvcc" is installed. EOF fi + local/nnet3/run_ivector_common.sh --stage $stage \ --nj $nj \ --min-seg-len $min_seg_len \ @@ -217,8 +221,6 @@ if [ $stage -le 18 ]; then --dir $dir fi - - if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from From 75e9d60ca5982ac27ba3e2d94fd97bc540f972cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 28 Jun 2018 12:39:30 +0200 Subject: [PATCH 33/35] some fix to tdnn scripts --- egs/tedlium/s5_r3/cmd.sh | 2 +- .../s5_r3/local/chain/compare_wer_general.sh | 111 ++++++++ .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 18 +- .../s5_r3/local/chain/tuning/run_tdnn_1b.sh | 251 ++++++++++++++++++ 4 files changed, 380 insertions(+), 2 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/chain/compare_wer_general.sh create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh diff --git 
a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh index 23a2b7b6a51..56c1d783a9e 100755 --- a/egs/tedlium/s5_r3/cmd.sh +++ b/egs/tedlium/s5_r3/cmd.sh @@ -12,4 +12,4 @@ # JHU cluster (or most clusters using GridEngine, with a suitable # conf/queue.conf). export train_cmd="queue.pl" -export decode_cmd="queue.pl --mem 4G" \ No newline at end of file +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..88dde1ff0e2 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index e6613529ab6..40cdcb5b5ff 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -1,6 +1,22 @@ #!/bin/bash -# See run_tdnnf_1a.sh for comparative results. +# Results + +# System tdnn_1a +# Scoring script sclite +# WER on dev(orig) 8.2 +# WER on dev(rescored ngram) 7.6 +# WER on dev(rescored rnnlm) 6.3 +# WER on test(orig) 8.1 +# WER on test(rescored ngram) 7.7 +# WER on test(rescored rnnlm) 6.7 +# Final train prob -0.0802 +# Final valid prob -0.0980 +# Final train prob (xent) -1.1450 +# Final valid prob (xent) -1.2498 +# Num-params 26651840 + + ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; ## otherwise call it directly in its location). 
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..2d1506f713c --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# run_tdnn_1b.sh is the script which results are presented in the corpus release paper. +# It use 2 to 6 jobs and add proportional-shrink 10. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b +# System tdnn_1a tdnn_1b tdnn_1b +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 +# Num-params 26651840 25782720 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1280 + linear-component name=tdnn2l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 + relu-batchnorm-layer name=tdnn3 dim=1280 + linear-component name=tdnn4l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 + relu-batchnorm-layer name=tdnn5 dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + 
linear-component name=tdnn10l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 + relu-batchnorm-layer name=prefinal-chain input=prefinal-l dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets + relu-batchnorm-layer name=prefinal-xent input=prefinal-l dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + +if [ $stage -le 19 ]; then 
+ # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 6095425e2ea886e5fc6b7fe1f5f7e66134151a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 28 Jun 2018 12:43:26 +0200 Subject: [PATCH 34/35] minor fix preparation scripts --- egs/tedlium/s5_r3/local/prepare_data.sh | 2 -- egs/tedlium/s5_r3/local/prepare_dict.sh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index 8de1752742b..c4b911601e5 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -11,8 +11,6 @@ . 
./path.sh -export LC_ALL=C - # Prepare: test, train, for set in dev test train; do dir=data/$set.orig diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh index 3cdbcb3fdf6..204b3f910e5 100755 --- a/egs/tedlium/s5_r3/local/prepare_dict.sh +++ b/egs/tedlium/s5_r3/local/prepare_dict.sh @@ -10,7 +10,7 @@ dir=data/local/dict_nosp mkdir -p $dir -srcdict=db//TEDLIUM_release-3/TEDLIUM.152k.dic +srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 From 285b3896dac0c65a7ec6cf1dc7deed4179d09198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 12 Jul 2018 21:22:10 +0200 Subject: [PATCH 35/35] add warning tdnnf setup --- egs/tedlium/s5_r3/local/chain/run_tdnnf.sh | 2 +- .../s5_r3/local/chain/tuning/run_tdnn_1b.sh | 8 +- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 250 ------------------ 3 files changed, 8 insertions(+), 252 deletions(-) delete mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh index cbbf0ed6533..61f8f499182 120000 --- a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -1 +1 @@ -tuning/run_tdnnf_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh index 2d1506f713c..f8eec8c5213 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -1,7 +1,13 @@ #!/bin/bash # run_tdnn_1b.sh is the script which results are presented in the corpus release paper. -# It use 2 to 6 jobs and add proportional-shrink 10. +# It uses 2 to 6 jobs and add proportional-shrink 10. + +# WARNING +# This script is flawed and misses key elements to optimize the tdnnf setup. 
+# You can run it as is to reproduce results from the corpus release paper, +# but a more up-to-date version should be looked at in other egs until another +# setup is added here. # local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b # System tdnn_1a tdnn_1b tdnn_1b diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh deleted file mode 100755 index d807c636ace..00000000000 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ /dev/null @@ -1,250 +0,0 @@ -#!/bin/bash - -# run_tdnnf_1a.sh is the script which results are presented in the corpus release paper. -# It use 2 to 6 jobs and add proportional-shrink 10. - -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a -# System tdnn_1a tdnnf_1a tdnnf_1a -# Scoring script sclite sclite score_basic -# WER on dev(orig) 8.2 7.9 7.9 -# WER on dev(rescored ngram) 7.6 7.4 7.5 -# WER on dev(rescored rnnlm) 6.3 6.2 6.2 -# WER on test(orig) 8.1 8.0 8.2 -# WER on test(rescored ngram) 7.7 7.7 7.9 -# WER on test(rescored rnnlm) 6.7 6.7 6.8 -# Final train prob -0.0802 -0.0899 -# Final valid prob -0.0980 -0.0974 -# Final train prob (xent) -1.1450 -0.9449 -# Final valid prob (xent) -1.2498 -1.0002 - - -## how you run this (note: this assumes that the run_tdnn.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - 
linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 - linear-component name=prefinal-l dim=256 $linear_opts - relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.proportional-shrink 10 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 6 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs false \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0