From a46da4e6a5938692dce94057f4fb6fec2ff60f32 Mon Sep 17 00:00:00 2001 From: Jan Trmal Date: Thu, 14 Dec 2017 16:22:46 -0500 Subject: [PATCH 01/10] Adding chime5 baseline recipe --- egs/chime5/README.txt | 10 + egs/chime5/s5/RESULTS | 10 + egs/chime5/s5/cmd.sh | 15 + egs/chime5/s5/conf/chime5.cfg | 50 ++++ egs/chime5/s5/conf/decode.config | 2 + egs/chime5/s5/conf/mfcc.conf | 2 + egs/chime5/s5/conf/mfcc_hires.conf | 10 + egs/chime5/s5/conf/online_cmvn.conf | 1 + egs/chime5/s5/local/chain/compare_wer.sh | 131 ++++++++ egs/chime5/s5/local/chain/run_tdnn.sh | 1 + .../s5/local/chain/tuning/run_tdnn_1e.sh | 283 ++++++++++++++++++ egs/chime5/s5/local/check_tools.sh | 46 +++ egs/chime5/s5/local/json2text.py | 78 +++++ egs/chime5/s5/local/nnet3/compare_wer.sh | 132 ++++++++ .../s5/local/nnet3/run_ivector_common.sh | 149 +++++++++ egs/chime5/s5/local/prepare_data.sh | 117 ++++++++ egs/chime5/s5/local/prepare_dict.sh | 130 ++++++++ egs/chime5/s5/local/run_beamformit.sh | 89 ++++++ egs/chime5/s5/local/score.sh | 1 + egs/chime5/s5/local/train_lms_srilm.sh | 270 +++++++++++++++++ egs/chime5/s5/local/wer_output_filter | 25 ++ egs/chime5/s5/path.sh | 7 + egs/chime5/s5/run.sh | 237 +++++++++++++++ egs/chime5/s5/steps | 1 + egs/chime5/s5/utils | 1 + 25 files changed, 1798 insertions(+) create mode 100644 egs/chime5/README.txt create mode 100644 egs/chime5/s5/RESULTS create mode 100644 egs/chime5/s5/cmd.sh create mode 100755 egs/chime5/s5/conf/chime5.cfg create mode 100644 egs/chime5/s5/conf/decode.config create mode 100644 egs/chime5/s5/conf/mfcc.conf create mode 100644 egs/chime5/s5/conf/mfcc_hires.conf create mode 100644 egs/chime5/s5/conf/online_cmvn.conf create mode 100755 egs/chime5/s5/local/chain/compare_wer.sh create mode 120000 egs/chime5/s5/local/chain/run_tdnn.sh create mode 100755 egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh create mode 100755 egs/chime5/s5/local/check_tools.sh create mode 100755 egs/chime5/s5/local/json2text.py create mode 100755 
egs/chime5/s5/local/nnet3/compare_wer.sh create mode 100755 egs/chime5/s5/local/nnet3/run_ivector_common.sh create mode 100755 egs/chime5/s5/local/prepare_data.sh create mode 100755 egs/chime5/s5/local/prepare_dict.sh create mode 100755 egs/chime5/s5/local/run_beamformit.sh create mode 120000 egs/chime5/s5/local/score.sh create mode 100755 egs/chime5/s5/local/train_lms_srilm.sh create mode 100755 egs/chime5/s5/local/wer_output_filter create mode 100644 egs/chime5/s5/path.sh create mode 100755 egs/chime5/s5/run.sh create mode 120000 egs/chime5/s5/steps create mode 120000 egs/chime5/s5/utils diff --git a/egs/chime5/README.txt b/egs/chime5/README.txt new file mode 100644 index 00000000000..771857f9433 --- /dev/null +++ b/egs/chime5/README.txt @@ -0,0 +1,10 @@ +This is a Kaldi recipe for the 5th CHiME Speech Separation and Recognition Challenge (CHiME-5). + +The CHiME-5 challenge will consider the problem of distant multi-microphone +conversational speech recognition in everyday home environments. Speech material +was elicited using a dinner party scenario with efforts taken to capture data +that is representative of natural conversational speech. + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. 
+ +s5 : Default recipe diff --git a/egs/chime5/s5/RESULTS b/egs/chime5/s5/RESULTS new file mode 100644 index 00000000000..b57787a0798 --- /dev/null +++ b/egs/chime5/s5/RESULTS @@ -0,0 +1,10 @@ + +# tri2 +%WER 92.26 [ 60741 / 65835, 3212 ins, 35241 del, 22288 sub ] exp/tri2/decode_dev_beamformit_ref/wer_16_1.0 +%WER 76.47 [ 50342 / 65835, 4356 ins, 19004 del, 26982 sub ] exp/tri2/decode_dev_worn/wer_14_1.0 + +# tri3 +%WER 92.43 [ 60852 / 65835, 3149 ins, 35536 del, 22167 sub ] exp/tri3/decode_dev_beamformit_ref.si/wer_17_1.0 +%WER 90.80 [ 59779 / 65835, 4742 ins, 27968 del, 27069 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 +%WER 76.38 [ 50283 / 65835, 3911 ins, 19081 del, 27291 sub ] exp/tri3/decode_dev_worn.si/wer_17_1.0 +%WER 73.13 [ 48146 / 65835, 4727 ins, 17274 del, 26145 sub ] exp/tri3/decode_dev_worn/wer_16_1.0 diff --git a/egs/chime5/s5/cmd.sh b/egs/chime5/s5/cmd.sh new file mode 100644 index 00000000000..a697a22cda3 --- /dev/null +++ b/egs/chime5/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime5/s5/conf/chime5.cfg b/egs/chime5/s5/conf/chime5.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime5/s5/conf/chime5.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime5/s5/conf/decode.config b/egs/chime5/s5/conf/decode.config new file mode 100644 index 00000000000..1940883b2f7 --- /dev/null +++ b/egs/chime5/s5/conf/decode.config @@ -0,0 +1,2 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. 
diff --git a/egs/chime5/s5/conf/mfcc.conf b/egs/chime5/s5/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime5/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime5/s5/conf/mfcc_hires.conf b/egs/chime5/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime5/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime5/s5/conf/online_cmvn.conf b/egs/chime5/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime5/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime5/s5/local/chain/compare_wer.sh b/egs/chime5/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..cd6be14ed88 --- /dev/null +++ b/egs/chime5/s5/local/chain/compare_wer.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... 
]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix 
+ wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/chime5/s5/local/chain/run_tdnn.sh b/egs/chime5/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..75da1a0a553 --- /dev/null +++ b/egs/chime5/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1e.sh \ No newline at end of file diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh new file mode 100755 index 00000000000..ba8779bcc77 --- /dev/null +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh @@ -0,0 +1,283 @@ +#!/bin/bash + +# 1e is as 1d but instead of the --proportional-shrink option, using +# the newly added xconfig-layer-specific 'l2-regularize' options. + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn eval_worn dev_beamformit_ref eval_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix=_chime5_tg + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1e # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + 
--egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +uconv=`command -v uconv 2>/dev/null` \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +srilm=`command -v ngram 2>/dev/null` \ + || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +phalign=`command -v phonetisaurus-align 2>/dev/null` \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +beamformit=`command -v BeamformIt 2>/dev/null` \ + || { echo >&2 "BeamformIt not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +exit 0 + + diff --git a/egs/chime5/s5/local/json2text.py b/egs/chime5/s5/local/json2text.py new file mode 100755 index 00000000000..a3b81fd7067 --- /dev/null +++ b/egs/chime5/s5/local/json2text.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + start_time = x['start_time'][mictype] + end_time = x['end_time'][mictype] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', 
'').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + end_time = hms_to_seconds(end_time) + + if args.mictype == 'worn': + uttid = speaker_id + '_' + session_id + '-' + start_time + '-' + end_time + else: + uttid = speaker_id + '_' + session_id + '_' + mictype + '-' + start_time + '-' + end_time + + if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime5/s5/local/nnet3/compare_wer.sh b/egs/chime5/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/chime5/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/chime5/s5/local/nnet3/run_ivector_common.sh b/egs/chime5/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..58f29f479bc --- /dev/null +++ b/egs/chime5/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn eval_worn dev_beamformit_ref eval_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,7,8}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +adir=$1 +jdir=$2 +dir=$3 + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add _L and _R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}_L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}_R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03_L-0006072-0006147 gimme the baker + # P09_S03_R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/_L-/p; g; s/-/_R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01.CH1-0006072-0006147 gimme the baker + # P09_S03_U01.CH2-0006072-0006147 gimme the baker + # P09_S03_U01.CH3-0006072-0006147 gimme the baker + # P09_S03_U01.CH4-0006072-0006147 gimme the baker + 
perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime5/s5/local/prepare_dict.sh b/egs/chime5/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..31d5ff9c77c --- /dev/null +++ b/egs/chime5/s5/local/prepare_dict.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +# This script also needs the phonetisaurus g2p, srilm,subversion, +# and ICU4C installed. We test for these things during the kaldi instalation +# and during when the master script is run, so we do not run any tests here. +. 
./local/check_tools.sh + +dir=data/local/dict_nosp + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. +paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { 
print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +set -x +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print $2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime5/s5/local/run_beamformit.sh b/egs/chime5/s5/local/run_beamformit.sh new file mode 100755 index 00000000000..78f740339fa --- /dev/null +++ b/egs/chime5/s5/local/run_beamformit.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... 
nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/chime5.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5/local/score.sh b/egs/chime5/s5/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime5/s5/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime5/s5/local/train_lms_srilm.sh b/egs/chime5/s5/local/train_lms_srilm.sh new file mode 100755 index 00000000000..09bba818ba6 --- /dev/null +++ b/egs/chime5/s5/local/train_lms_srilm.sh @@ -0,0 +1,270 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. 
" + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! 
-z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . 
"\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +set -x +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo 
"-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 
4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 
-gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo "Skipping MaxEnt models" +fi + + +echo "--------------------" +echo "Computing perplexity" +echo 
"--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime5/s5/local/wer_output_filter b/egs/chime5/s5/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime5/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while() { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . " " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime5/s5/path.sh b/egs/chime5/s5/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime5/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh new file mode 100755 index 00000000000..c3b724fe5b4 --- /dev/null +++ b/egs/chime5/s5/run.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/data/transcriptions +audio_dir=${chime5_corpus}/data/audio + +# training and test data +train_set=train_worn_u100k +test_sets="dev_worn dev_${enhancement}_ref" +# use the below once you obtain the evaluation data. 
Also remove the comment #eval# in the lines below +#eval#test_sets="dev_worn eval_worn dev_${enhancement}_ref eval_${enhancement}_ref" + +./local/check_tools.sh || exit 1 + +if [ $stage -le 1 ]; then + # skip u03 as they are missing + for mictype in worn u01 u02 u04 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + #eval#for dataset in dev eval; do + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang_nosp/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_test + +fi + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + #eval#for dset in dev eval; do + for dset in dev; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${audio_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + #eval#for dset in dev eval; do + for dset in dev; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_ref + done +fi + +if [ $stage -le 5 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text 
+ utils/fix_data_dir.sh data/train_worn + + # combine mix array and worn mics + # randomly extract first 100k utterances from all mics + # If you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 100000 data/train_u100k + utils/combine_data.sh data/${train_set} data/train_worn data/train_u100k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + #eval#for dset in train dev eval; do + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "_L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 6 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${train_set} ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done +fi + +if [ $stage -le 7 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. 
+ mfccdir=mfcc + for x in ${train_set} ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 8 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 9 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang_nosp exp/mono +fi + +if [ $stage -le 10 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang_nosp exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang_nosp exp/mono_ali exp/tri1 +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang_nosp exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang_nosp exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_nosp_test exp/tri2 exp/tri2/graph_nosp + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph_nosp data/${dset} exp/tri2/decode_${dset}_nosp + done +fi + +if [ $stage -le 13 ]; then + # create a more refined lexicon (include pronunciation probabilities) + steps/get_prons.sh --cmd "$train_cmd" \ + data/${train_set} data/lang_nosp exp/tri2 + + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + exp/tri2/sil_counts_nowb.txt \ + exp/tri2/pron_bigram_counts_nowb.txt data/local/dict + + # add explicit phone loop for model + utils/lang/make_unk_lm.sh --use-pocolm false \ + data/local/dict exp/make_unk + + # and 
compile the lang directory + utils/prepare_lang.sh \ + --unk-fst exp/make_unk/unk_fst.txt \ + --phone-symbol-table data/lang_nosp/phones.txt \ + data/local/dict "" data/local/lang_test data/lang_test + + # and convert the LM in arpa to G.fst + utils/format_lm.sh \ + data/lang_test $LM data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 14 ]; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & + done + wait +fi + +if [ $stage -le 15 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 16 ]; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + for dset in ${test_sets}; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & + done + wait +fi + +if [ $stage -le 17 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +if [ $stage -le 18 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj ${nj} --test_sets "$test_sets" +fi diff --git a/egs/chime5/s5/steps b/egs/chime5/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime5/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime5/s5/utils b/egs/chime5/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime5/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file From 
b09ee9c54bb28ee2b4334b871b7a4fd9352d613f Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Wed, 7 Mar 2018 15:56:51 -0500 Subject: [PATCH 02/10] [egs] fixed a data path and bug in data prep at chime5 --- egs/chime5/s5/local/prepare_dict.sh | 5 ----- egs/chime5/s5/run.sh | 5 +++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/egs/chime5/s5/local/prepare_dict.sh b/egs/chime5/s5/local/prepare_dict.sh index 31d5ff9c77c..468f2e915d0 100755 --- a/egs/chime5/s5/local/prepare_dict.sh +++ b/egs/chime5/s5/local/prepare_dict.sh @@ -24,11 +24,6 @@ set -o nounset # Treat unset variables as an error # check existing directories [ $# != 0 ] && echo "Usage: $0" && exit 1; -# This script also needs the phonetisaurus g2p, srilm,subversion, -# and ICU4C installed. We test for these things during the kaldi instalation -# and during when the master script is run, so we do not run any tests here. -. ./local/check_tools.sh - dir=data/local/dict_nosp mkdir -p $dir diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index c3b724fe5b4..077a60654c4 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -24,8 +24,8 @@ set -e # exit on error # chime5 main directory path # please change the path accordingly chime5_corpus=/export/corpora4/CHiME5 -json_dir=${chime5_corpus}/data/transcriptions -audio_dir=${chime5_corpus}/data/audio +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio # training and test data train_set=train_worn_u100k @@ -33,6 +33,7 @@ test_sets="dev_worn dev_${enhancement}_ref" # use the below once you obtain the evaluation data. 
Also remove the comment #eval# in the lines below #eval#test_sets="dev_worn eval_worn dev_${enhancement}_ref eval_${enhancement}_ref" +# This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1 if [ $stage -le 1 ]; then From 6c6918947df0035e3502df522b2187def58acada Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Thu, 8 Mar 2018 08:51:35 -0500 Subject: [PATCH 03/10] [egs] fixed a wrong directory name in data/lang --- egs/chime5/s5/run.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index 077a60654c4..11824af2c31 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -179,28 +179,28 @@ if [ $stage -le 13 ]; then steps/get_prons.sh --cmd "$train_cmd" \ data/${train_set} data/lang_nosp exp/tri2 - utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ exp/tri2/sil_counts_nowb.txt \ exp/tri2/pron_bigram_counts_nowb.txt data/local/dict # add explicit phone loop for model utils/lang/make_unk_lm.sh --use-pocolm false \ - data/local/dict exp/make_unk + data/local/dict exp/make_unk # and compile the lang directory utils/prepare_lang.sh \ --unk-fst exp/make_unk/unk_fst.txt \ - --phone-symbol-table data/lang_nosp/phones.txt \ - data/local/dict "" data/local/lang_test data/lang_test + --phone-symbol-table data/lang_nosp/phones.txt \ + data/local/dict "" data/local/lang data/lang # and convert the LM in arpa to G.fst utils/format_lm.sh \ - data/lang_test $LM data/local/dict/lexicon.txt data/lang_test + data/lang $LM data/local/dict/lexicon.txt data/lang fi if [ $stage -le 14 ]; then - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph for dset in ${test_sets}; do steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" 
--num-threads 4 \ exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & @@ -217,7 +217,7 @@ if [ $stage -le 15 ]; then fi if [ $stage -le 16 ]; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph for dset in ${test_sets}; do steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & From 5a3b8b15c3e5ac05e615ed70b7972679ed51ffe1 Mon Sep 17 00:00:00 2001 From: Jan Trmal Date: Thu, 8 Mar 2018 11:05:56 -0500 Subject: [PATCH 04/10] fixing some issues --- .../s5/conf/{chime5.cfg => beamformit.cfg} | 0 egs/chime5/s5/conf/decode.config | 2 -- egs/chime5/s5/local/check_tools.sh | 15 ++++++++---- egs/chime5/s5/local/prepare_data.sh | 16 +++++++++++++ egs/chime5/s5/local/prepare_dict.sh | 1 - egs/chime5/s5/local/run_beamformit.sh | 8 +++---- egs/chime5/s5/local/train_lms_srilm.sh | 23 ++++++------------- 7 files changed, 37 insertions(+), 28 deletions(-) rename egs/chime5/s5/conf/{chime5.cfg => beamformit.cfg} (100%) delete mode 100644 egs/chime5/s5/conf/decode.config diff --git a/egs/chime5/s5/conf/chime5.cfg b/egs/chime5/s5/conf/beamformit.cfg similarity index 100% rename from egs/chime5/s5/conf/chime5.cfg rename to egs/chime5/s5/conf/beamformit.cfg diff --git a/egs/chime5/s5/conf/decode.config b/egs/chime5/s5/conf/decode.config deleted file mode 100644 index 1940883b2f7..00000000000 --- a/egs/chime5/s5/conf/decode.config +++ /dev/null @@ -1,2 +0,0 @@ -beam=11.0 # beam for decoding. Was 13.0 in the scripts. -first_beam=8.0 # beam for 1st-pass decoding in SAT. diff --git a/egs/chime5/s5/local/check_tools.sh b/egs/chime5/s5/local/check_tools.sh index ef2fe9d5e5d..ff2c53ea3d8 100755 --- a/egs/chime5/s5/local/check_tools.sh +++ b/egs/chime5/s5/local/check_tools.sh @@ -17,12 +17,19 @@ [ -f ./path.sh ] && . ./path.sh -uconv=`command -v uconv 2>/dev/null` \ +command -v uconv &>/dev/null \ || { echo >&2 "uconv not found on PATH. 
You will have to install ICU4C"; exit 1; } -srilm=`command -v ngram 2>/dev/null` \ +command -v ngram &>/dev/null \ || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + sox=`command -v sox 2>/dev/null` \ || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } @@ -35,10 +42,10 @@ if [ ! -z "$sox" ]; then fi fi -phalign=`command -v phonetisaurus-align 2>/dev/null` \ +command -v phonetisaurus-align &>/dev/null \ || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } -beamformit=`command -v BeamformIt 2>/dev/null` \ +command -v BeamformIt &>/dev/null \ || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } exit 0 diff --git a/egs/chime5/s5/local/prepare_data.sh b/egs/chime5/s5/local/prepare_data.sh index c273b1b42f2..af37b0841fa 100755 --- a/egs/chime5/s5/local/prepare_data.sh +++ b/egs/chime5/s5/local/prepare_data.sh @@ -20,10 +20,26 @@ if [ $# -ne 3 ] ; then exit 1 fi +set -e -o pipefail + adir=$1 jdir=$2 dir=$3 +json_count=$(find $jdir -name "*.json" | wc -l) +wav_count=$(find $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
+ exit 1 +fi + echo "$0: Converting transcription to text" mkdir -p $dir diff --git a/egs/chime5/s5/local/prepare_dict.sh b/egs/chime5/s5/local/prepare_dict.sh index 468f2e915d0..e2a16b92f7d 100755 --- a/egs/chime5/s5/local/prepare_dict.sh +++ b/egs/chime5/s5/local/prepare_dict.sh @@ -87,7 +87,6 @@ awk '{print $1}' $dir/iv_lexicon.txt | \ if (!defined $seen{$w}) { print; } } ' $dir/word_counts > $dir/oov_counts.txt -set -x echo "*Highest-count OOVs (including fragments) are:" head -n 10 $dir/oov_counts.txt echo "*Highest-count OOVs (excluding fragments) are:" diff --git a/egs/chime5/s5/local/run_beamformit.sh b/egs/chime5/s5/local/run_beamformit.sh index 78f740339fa..176fd108d5d 100755 --- a/egs/chime5/s5/local/run_beamformit.sh +++ b/egs/chime5/s5/local/run_beamformit.sh @@ -25,11 +25,9 @@ odir=$2 array=$3 expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` -if [ -z $BEAMFORMIT ] ; then - export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 fi -export PATH=${PATH}:$BEAMFORMIT -! hash BeamformIt && echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 # Set bash to 'debug' mode, it will exit on : # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', @@ -75,7 +73,7 @@ for n in `seq $nj`; do cat << EOF > $expdir/log/beamform.$n.sh while read line; do $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ - --config_file `pwd`/conf/chime5.cfg \ + --config_file `pwd`/conf/beamformit.cfg \ --source_dir $sdir \ --result_dir $odir done < $output_wavfiles.$n diff --git a/egs/chime5/s5/local/train_lms_srilm.sh b/egs/chime5/s5/local/train_lms_srilm.sh index 09bba818ba6..8caa251fa35 100755 --- a/egs/chime5/s5/local/train_lms_srilm.sh +++ b/egs/chime5/s5/local/train_lms_srilm.sh @@ -40,20 +40,9 @@ tgtdir=$2 ##End of configuration loc=`which ngram-count`; if [ -z $loc ]; then - if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... - sdir=`pwd`/../../../tools/srilm/bin/i686-m64 - else - sdir=`pwd`/../../../tools/srilm/bin/i686 - fi - if [ -f $sdir/ngram-count ]; then - echo Using SRILM tools from $sdir - export PATH=$PATH:$sdir - else - echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. - exit 1 - fi + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 fi # Prepare the destination directory @@ -122,7 +111,6 @@ else echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` fi -set -x # Kaldi transcript files contain Utterance_ID as the first word; remove it sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt @@ -247,7 +235,10 @@ if [ ! 
-z ${LIBLBFGS} ]; then ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 else - echo "Skipping MaxEnt models" + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; fi From 196cb77a0b0ab9ccb4ab6a3ebe78471ef8b24a3d Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Fri, 9 Mar 2018 10:54:52 -0500 Subject: [PATCH 05/10] [egs] fixed chain related path issues and reflected Dan's comments --- egs/chime5/s5/local/chain/run_tdnn.sh | 2 +- .../chain/tuning/{run_tdnn_1e.sh => run_tdnn_1a.sh} | 10 ++++------ egs/chime5/s5/local/nnet3/run_ivector_common.sh | 2 +- egs/chime5/s5/run.sh | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) rename egs/chime5/s5/local/chain/tuning/{run_tdnn_1e.sh => run_tdnn_1a.sh} (97%) diff --git a/egs/chime5/s5/local/chain/run_tdnn.sh b/egs/chime5/s5/local/chain/run_tdnn.sh index 75da1a0a553..34499362831 120000 --- a/egs/chime5/s5/local/chain/run_tdnn.sh +++ b/egs/chime5/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh similarity index 97% rename from egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh rename to egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index ba8779bcc77..7c599e9ee8a 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -1,8 +1,5 @@ #!/bin/bash -# 1e is as 1d but instead of the --proportional-shrink option, using -# the newly added xconfig-layer-specific 'l2-regularize' options. 
- # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -11,10 +8,10 @@ set -euo pipefail stage=0 nj=96 train_set=train_worn_u100k -test_sets="dev_worn eval_worn dev_beamformit_ref eval_beamformit_ref" +test_sets="dev_worn dev_beamformit_ref" gmm=tri3 nnet3_affix=_train_worn_u100k -lm_suffix=_chime5_tg +lm_suffix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. @@ -62,6 +59,7 @@ fi # run those things. local/nnet3/run_ivector_common.sh --stage $stage \ --train-set $train_set \ + --test-sets $test_sets \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; @@ -137,7 +135,7 @@ if [ $stage -le 13 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) opts="l2-regularize=0.05" - output_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.01 bottleneck-dim=320" mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/chime5/s5/local/nnet3/run_ivector_common.sh b/egs/chime5/s5/local/nnet3/run_ivector_common.sh index 58f29f479bc..e28e5ce996d 100755 --- a/egs/chime5/s5/local/nnet3/run_ivector_common.sh +++ b/egs/chime5/s5/local/nnet3/run_ivector_common.sh @@ -10,7 +10,7 @@ set -euo pipefail stage=0 train_set=train_worn_u100k -test_sets="dev_worn eval_worn dev_beamformit_ref eval_beamformit_ref" +test_sets="dev_worn dev_beamformit_ref" gmm=tri3 nj=96 diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index 11824af2c31..a2c16a98bd3 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -31,7 +31,7 @@ audio_dir=${chime5_corpus}/audio train_set=train_worn_u100k test_sets="dev_worn dev_${enhancement}_ref" # use the below once you obtain the evaluation data. 
Also remove the comment #eval# in the lines below -#eval#test_sets="dev_worn eval_worn dev_${enhancement}_ref eval_${enhancement}_ref" +#eval#test_sets="dev_worn dev_${enhancement}_ref eval_${enhancement}_ref" # This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1 @@ -234,5 +234,5 @@ fi if [ $stage -le 18 ]; then # chain TDNN - local/chain/run_tdnn.sh --nj ${nj} --test_sets "$test_sets" + local/chain/run_tdnn.sh --nj ${nj} --train_set ${train_set}_cleaned --test_sets "$test_sets" --gmm tri3_cleaned --nnet3_affix _${train_set}_cleaned fi From 87970657763b5c7340300e209fdfa405340afe80 Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Fri, 9 Mar 2018 17:30:24 -0500 Subject: [PATCH 06/10] [egs] added location tags for future scoring. also changed the left and right channel information according to the other channel information format --- egs/chime5/s5/local/json2text.py | 14 ++++++++++---- egs/chime5/s5/local/prepare_data.sh | 25 ++++++++++++++----------- egs/chime5/s5/run.sh | 5 +++-- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/egs/chime5/s5/local/json2text.py b/egs/chime5/s5/local/json2text.py index a3b81fd7067..4df0160efb6 100755 --- a/egs/chime5/s5/local/json2text.py +++ b/egs/chime5/s5/local/json2text.py @@ -50,6 +50,12 @@ def hms_to_seconds(hms): else: mictype = args.mictype.upper() # convert from u01 to U01 + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + start_time = x['start_time'][mictype] end_time = x['end_time'][mictype] @@ -69,10 +75,10 @@ def hms_to_seconds(hms): start_time = hms_to_seconds(start_time) end_time = hms_to_seconds(end_time) - if args.mictype == 'worn': - uttid = speaker_id + '_' + session_id + '-' + start_time + '-' + end_time - else: - uttid = speaker_id + '_' + session_id + '_' + mictype + '-' + start_time + '-' + end_time + uttid = speaker_id + '_' + session_id + 
if not args.mictype == 'worn': + uttid += '_' + mictype + uttid += '_' + location + '-' + start_time + '-' + end_time if end_time > start_time: sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime5/s5/local/prepare_data.sh b/egs/chime5/s5/local/prepare_data.sh index af37b0841fa..a037f371e34 100755 --- a/egs/chime5/s5/local/prepare_data.sh +++ b/egs/chime5/s5/local/prepare_data.sh @@ -54,7 +54,7 @@ echo "$0: Creating datadir $dir for type=\"$mictype\"" if [ $mictype == "worn" ]; then # convert the filenames to wav.scp format, use the basename of the file - # as a the wav.scp key, add _L and _R for left and right channel + # as a the wav.scp key, add .L and .R for left and right channel # i.e. each file will have two entries (left and right channel) find $adir -name "S[0-9]*_P[0-9]*.wav" | \ perl -ne '{ @@ -64,17 +64,17 @@ if [ $mictype == "worn" ]; then @F = split "/", $path; ($f = $F[@F-1]) =~ s/.wav//; @F = split "_", $f; - print "${F[1]}_${F[0]}_L sox $path -t wav - remix 1 |\n"; - print "${F[1]}_${F[0]}_R sox $path -t wav - remix 2 |\n"; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; }' | sort > $dir/wav.scp # generate the transcripts for both left and right channel # from the original transcript in the form # P09_S03-0006072-0006147 gimme the baker # create left and right channel transcript - # P09_S03_L-0006072-0006147 gimme the baker - # P09_S03_R-0006072-0006147 gimme the baker - sed -n 's/ *$//; h; s/-/_L-/p; g; s/-/_R-/p' $dir/text.orig | sort > $dir/text + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text elif [ $mictype == "ref" ]; then # fixed reference array @@ -98,10 +98,10 @@ else # convert the transcripts from # P09_S03-0006072-0006147 gimme the baker # to the per-channel transcripts - # 
P09_S03_U01.CH1-0006072-0006147 gimme the baker - # P09_S03_U01.CH2-0006072-0006147 gimme the baker - # P09_S03_U01.CH3-0006072-0006147 gimme the baker - # P09_S03_U01.CH4-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker perl -ne '$l=$_; for($i=1; $i<=4; $i++) { ($x=$l)=~ s/-/.CH\Q$i\E-/; @@ -113,15 +113,18 @@ $cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist # Prepare 'segments', 'utt2spk', 'spk2utt' if [ $mictype == "worn" ]; then cut -d" " -f 1 $dir/text | \ - awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ > $dir/segments elif [ $mictype == "ref" ]; then cut -d" " -f 1 $dir/text | \ awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ sed -e "s/ P.._/ /" > $dir/segments else cut -d" " -f 1 $dir/text | \ awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ sed -e 's/ P.._/ /' > $dir/segments fi cut -f 1 -d ' ' $dir/segments | \ diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index a2c16a98bd3..5bb037efd9a 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -95,13 +95,14 @@ fi if [ $stage -le 5 ]; then # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text utils/fix_data_dir.sh data/train_worn # combine mix array and worn mics # randomly extract first 100k utterances from all mics - # If you 
want to include more training data, you can increase the number of array mic utterances + # if you want to include more training data, you can increase the number of array mic utterances utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06 utils/subset_data_dir.sh data/train_uall 100000 data/train_u100k utils/combine_data.sh data/${train_set} data/train_worn data/train_u100k @@ -111,7 +112,7 @@ if [ $stage -le 5 ]; then #eval#for dset in train dev eval; do for dset in train dev; do utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo - grep "_L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text utils/fix_data_dir.sh data/${dset}_worn done fi From d5d93fead3b949a16009f749e5a10be524a67f9f Mon Sep 17 00:00:00 2001 From: Jan Trmal Date: Sat, 10 Mar 2018 02:39:10 -0500 Subject: [PATCH 07/10] forward the test set names to ivector_common --- egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh | 1 + egs/chime5/s5/run.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 7c599e9ee8a..cb7cea9310d 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -58,6 +58,7 @@ fi # nnet3 setup, and you can skip them by setting "--stage 11" if you have already # run those things. 
local/nnet3/run_ivector_common.sh --stage $stage \ + --test-sets "$test_sets" \ --train-set $train_set \ --test-sets $test_sets \ --gmm $gmm \ diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index 5bb037efd9a..9a57289d592 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -235,5 +235,5 @@ fi if [ $stage -le 18 ]; then # chain TDNN - local/chain/run_tdnn.sh --nj ${nj} --train_set ${train_set}_cleaned --test_sets "$test_sets" --gmm tri3_cleaned --nnet3_affix _${train_set}_cleaned + local/chain/run_tdnn.sh --nj ${nj} --train-set ${train_set}_cleaned --test-sets "$test_sets" --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned fi From ffbe47eea3bb5bf7ccdbe151cdf129351435cba1 Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Sat, 10 Mar 2018 18:21:08 -0500 Subject: [PATCH 08/10] [egs] removed lexicon update --- .../s5/local/chain/tuning/run_tdnn_1a.sh | 4 +- egs/chime5/s5/local/prepare_dict.sh | 2 +- egs/chime5/s5/run.sh | 53 +++++-------------- 3 files changed, 17 insertions(+), 42 deletions(-) diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 7c599e9ee8a..cb063420593 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -15,7 +15,7 @@ lm_suffix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. -affix=1e # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 @@ -59,7 +59,7 @@ fi # run those things. 
local/nnet3/run_ivector_common.sh --stage $stage \ --train-set $train_set \ - --test-sets $test_sets \ + --test-sets "$test_sets" \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; diff --git a/egs/chime5/s5/local/prepare_dict.sh b/egs/chime5/s5/local/prepare_dict.sh index e2a16b92f7d..09083d0e795 100755 --- a/egs/chime5/s5/local/prepare_dict.sh +++ b/egs/chime5/s5/local/prepare_dict.sh @@ -24,7 +24,7 @@ set -o nounset # Treat unset variables as an error # check existing directories [ $# != 0 ] && echo "Usage: $0" && exit 1; -dir=data/local/dict_nosp +dir=data/local/dict mkdir -p $dir echo "$0: Getting CMU dictionary" diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index 5bb037efd9a..b586231f7e6 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -56,11 +56,11 @@ if [ $stage -le 2 ]; then local/prepare_dict.sh utils/prepare_lang.sh \ - data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp + data/local/dict "" data/local/lang data/lang local/train_lms_srilm.sh \ --train-text data/train_worn/text --dev-text data/dev_worn/text \ - --oov-symbol "" --words-file data/lang_nosp/words.txt \ + --oov-symbol "" --words-file data/lang/words.txt \ data/ data/srilm fi @@ -68,7 +68,7 @@ LM=data/srilm/best_3gram.gz if [ $stage -le 3 ]; then # Compiles G for chime5 trigram LM utils/format_lm.sh \ - data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_test + data/lang $LM data/local/dict/lexicon.txt data/lang fi @@ -148,59 +148,34 @@ fi if [ $stage -le 9 ]; then # Starting basic training on MFCC features steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ - data/${train_set}_30kshort data/lang_nosp exp/mono + data/${train_set}_30kshort data/lang exp/mono fi if [ $stage -le 10 ]; then steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/${train_set} data/lang_nosp exp/mono exp/mono_ali + data/${train_set} data/lang exp/mono exp/mono_ali steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/${train_set} data/lang_nosp exp/mono_ali 
exp/tri1 + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 fi if [ $stage -le 11 ]; then steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/${train_set} data/lang_nosp exp/tri1 exp/tri1_ali + data/${train_set} data/lang exp/tri1 exp/tri1_ali steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 4000 50000 data/${train_set} data/lang_nosp exp/tri1_ali exp/tri2 + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 fi if [ $stage -le 12 ]; then - utils/mkgraph.sh data/lang_nosp_test exp/tri2 exp/tri2/graph_nosp + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph for dset in ${test_sets}; do steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - exp/tri2/graph_nosp data/${dset} exp/tri2/decode_${dset}_nosp + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} done fi if [ $stage -le 13 ]; then - # create a more refined lexicon (include pronunciation probabilities) - steps/get_prons.sh --cmd "$train_cmd" \ - data/${train_set} data/lang_nosp exp/tri2 - - utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ - exp/tri2/sil_counts_nowb.txt \ - exp/tri2/pron_bigram_counts_nowb.txt data/local/dict - - # add explicit phone loop for model - utils/lang/make_unk_lm.sh --use-pocolm false \ - data/local/dict exp/make_unk - - # and compile the lang directory - utils/prepare_lang.sh \ - --unk-fst exp/make_unk/unk_fst.txt \ - --phone-symbol-table data/lang_nosp/phones.txt \ - data/local/dict "" data/local/lang data/lang - - # and convert the LM in arpa to G.fst - utils/format_lm.sh \ - data/lang $LM data/local/dict/lexicon.txt data/lang -fi - -if [ $stage -le 14 ]; then utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph for dset in ${test_sets}; do steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ @@ -209,7 +184,7 @@ if [ $stage -le 14 ]; then wait fi -if [ $stage -le 15 ]; then +if [ $stage -le 14 ]; then steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/${train_set} 
data/lang exp/tri2 exp/tri2_ali @@ -217,7 +192,7 @@ if [ $stage -le 15 ]; then 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 fi -if [ $stage -le 16 ]; then +if [ $stage -le 15 ]; then utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph for dset in ${test_sets}; do steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ @@ -226,14 +201,14 @@ if [ $stage -le 16 ]; then wait fi -if [ $stage -le 17 ]; then +if [ $stage -le 16 ]; then # The following script cleans the data and produces cleaned data steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned fi -if [ $stage -le 18 ]; then +if [ $stage -le 17 ]; then # chain TDNN local/chain/run_tdnn.sh --nj ${nj} --train_set ${train_set}_cleaned --test_sets "$test_sets" --gmm tri3_cleaned --nnet3_affix _${train_set}_cleaned fi From 9b936cacf38abd404faca276b9d4d2266871b2c3 Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Sat, 10 Mar 2018 18:25:03 -0500 Subject: [PATCH 09/10] [egs] merge Yenda's update --- egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 8a6106292d5..cb063420593 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -58,7 +58,6 @@ fi # nnet3 setup, and you can skip them by setting "--stage 11" if you have already # run those things. 
local/nnet3/run_ivector_common.sh --stage $stage \ - --test-sets "$test_sets" \ --train-set $train_set \ --test-sets "$test_sets" \ --gmm $gmm \ From bc2f8f62cc0f3babdb13e71c0f6ef283e2612434 Mon Sep 17 00:00:00 2001 From: Shinji Watanabe Date: Tue, 13 Mar 2018 09:14:11 -0400 Subject: [PATCH 10/10] [egs] added RESULTS --- egs/chime5/s5/RESULTS | 15 +++++++++------ egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/egs/chime5/s5/RESULTS b/egs/chime5/s5/RESULTS index b57787a0798..941b63ece52 100644 --- a/egs/chime5/s5/RESULTS +++ b/egs/chime5/s5/RESULTS @@ -1,10 +1,13 @@ # tri2 -%WER 92.26 [ 60741 / 65835, 3212 ins, 35241 del, 22288 sub ] exp/tri2/decode_dev_beamformit_ref/wer_16_1.0 -%WER 76.47 [ 50342 / 65835, 4356 ins, 19004 del, 26982 sub ] exp/tri2/decode_dev_worn/wer_14_1.0 +%WER 76.40 [ 44985 / 58881, 3496 ins, 17652 del, 23837 sub ] exp/tri2/decode_dev_worn/wer_13_1.0 +%WER 93.56 [ 55091 / 58881, 2132 ins, 35555 del, 17404 sub ] exp/tri2/decode_dev_beamformit_ref/wer_17_1.0 # tri3 -%WER 92.43 [ 60852 / 65835, 3149 ins, 35536 del, 22167 sub ] exp/tri3/decode_dev_beamformit_ref.si/wer_17_1.0 -%WER 90.80 [ 59779 / 65835, 4742 ins, 27968 del, 27069 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 -%WER 76.38 [ 50283 / 65835, 3911 ins, 19081 del, 27291 sub ] exp/tri3/decode_dev_worn.si/wer_17_1.0 -%WER 73.13 [ 48146 / 65835, 4727 ins, 17274 del, 26145 sub ] exp/tri3/decode_dev_worn/wer_16_1.0 +%WER 72.81 [ 42869 / 58881, 3629 ins, 15998 del, 23242 sub ] exp/tri3/decode_dev_worn/wer_15_1.0 +%WER 91.73 [ 54013 / 58881, 3519 ins, 27098 del, 23396 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 + +# nnet3 tdnn+chain +%WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0 +%WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5 + diff 
--git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index cb063420593..45a7fd84bd6 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -36,7 +36,7 @@ remove_egs=true reporting_email= #decode options -test_online_decoding=true # if true, it will run the last decoding stage. +test_online_decoding=false # if true, it will run the last decoding stage. # End configuration section.